import javax.servlet.*; import javax.servlet.http.*; import java.sql.*; import java.util.*; import java.net.*; /** * Performs what is probably the most intersting search, the motif * search. The motif search takes a string of words from the user * and finds the set of synonyms for all of those words. Then it * finds the occurrences of each of these words, and assigns scores * to pages in the database which match the words and their synonyms. * It also gives additional weight to words which are less common. */ public class MotifSearch extends LitsearchBase { public String drawPage(HttpServletRequest request, HttpServletResponse response) throws Exception { String searchQuery = request.getParameter("query"); //If there is no query, show the search page. if (searchQuery == null) { Template t = new Template(TEMPLATE_DIR + "motifsearch.html"); return t.toString(); } searchQuery = searchQuery.trim(); String text = ""; String[] stripes = new String[2]; stripes[0] = ""; stripes[1] = ""; int curColor = 0; //Parse the query into individual words, the stem those words. ArrayList terms = new ArrayList(); while (true) { int i = searchQuery.indexOf(' '); if (i == -1) { terms.add(PorterStemmer.stem(searchQuery)); break; } terms.add(PorterStemmer.stem(searchQuery.substring(0, i))); searchQuery = searchQuery.substring(i + 1).trim(); } //Call the search algorithm to get a list of results and scores Hashtable results = doMotifSearch(terms); text += ""; text += stripes[curColor % 2] + "\n"; text += stripes[curColor++ %2] + "\n"; if (results.size() == 0) return text + "
"; text += "Found " + results.size() + " results for your query: \"" + request.getParameter("query") + "\"
TitleAuthorPage (click to view)Score
"; //Look up the full information on each of the search results. ConnectionWrapper wrapper = null; try { wrapper = DatabaseHook.getConnection(); Connection c = wrapper.connection(); Statement stmt = c.createStatement(); ResultSet rs; String query; ArrayList finalResults = new ArrayList(); Enumeration enum = results.keys(); String pageList = "-1"; while (enum.hasMoreElements()) { String key = (String) enum.nextElement(); query = "SELECT w.id, w.title, b.name, u.page, u.id AS pageid " + "FROM UniquePage u, Work w, WrittenBy b " + "WHERE u.id =" + key + " AND w.id = u.work AND b.work = w.id"; rs = stmt.executeQuery(query); if (!rs.next()) break; int score = Integer.parseInt((String) results.get(rs.getString(5))); ResultStore r = new ResultStore(rs.getInt(1), rs.getString(2), rs.getString(3), rs.getString(4), rs.getString(5), score); finalResults.add(r); } //Sort the results in order of score Collections.sort(finalResults); int max = 1; ResultStore maxRS = (ResultStore) Collections.max(finalResults); if (maxRS != null) max = maxRS.score; //Display the results in pretty tabular format for (int i = finalResults.size() - 1; i >= 0; i--) { ResultStore r = (ResultStore) finalResults.get(i); int widthGood = (int) (r.score * 60.0 / max); int widthBad = 60 - widthGood; text += stripes[curColor++ % 2]; text += "" + r.title + ""; text += "" + r.name + "\n"; text += "page " + r.page + "\n"; text += "" + " " + r.score + "\n"; } text += stripes[curColor++ % 2]; text += "Search Again\n"; text += ""; wrapper.checkIn(); return text; } catch (Exception e) { wrapper.checkIn(); throw e; } } /** * Performs the motif search. Comments in the code describe the process * Note: This function used to make much more use of "interesting" * unions and subqueries, but these were rewritten to preserve MySQL * compatibility. The query is still pretty "interesting," though. */ public Hashtable doMotifSearch(ArrayList terms) throws Exception { int numSynPerWord = 12 / terms.size(); ConnectionWrapper wrapper = null; try { wrapper = DatabaseHook.getConnection(); Connection c = wrapper.connection(); Statement stmt = c.createStatement(); ResultSet rs = null; String query; //Step 1: Find all of the words query = "SELECT id FROM Word WHERE word IN (-1"; Iterator itr = terms.iterator(); while (itr.hasNext()) { query += ", " + formatString((String) itr.next()); } query += ")"; rs = stmt.executeQuery(query); int[] roots = new int[terms.size()]; int[] freqs = new int[terms.size()]; int i = 0; while (rs.next()) { roots[i++] = rs.getInt(1); } Arrays.sort(roots); //sort roots inascending order //Step 1.5: Get the frequencies of all of the roots query = "SELECT word, count(*) score FROM PageContainsWord where word IN(-1"; for (i = 0; i < roots.length; i++) { query += ", " + roots[i]; } query += ") GROUP BY word ORDER BY word"; rs = stmt.executeQuery(query); i = 0; int maxFreq = 0; while (rs.next()) { int f = rs.getInt(1); if (f > maxFreq) maxFreq = f; freqs[i++] = f; } //Step 2: Get X synonyms from each description. String allSyn = ""; for (i = 0; i < terms.size(); i++) { if (freqs[i] > 0) { //get the frequency of each word and use to determine the num to accept query = "SELECT child FROM Syn WHERE root = " + roots[i] + " LIMIT " + numSynPerWord * maxFreq / freqs[i]; ; rs = stmt.executeQuery(query); while (rs.next()) { allSyn += rs.getString(1) + ","; } allSyn += roots[i] + ","; } } //Step 3: Do the large query, group the results by the //page they appear on, and assign each page a score //based on these results. allSyn = allSyn.substring(0, allSyn.length() - 1); query = "SELECT page, COUNT(*) AS score " + "FROM PageContainsWord WHERE word IN (" + allSyn + ") " + "GROUP BY page " + "ORDER BY score DESC, page " + "LIMIT 40"; Hashtable pageKeys = new Hashtable(); rs = stmt.executeQuery(query); while(rs.next()) { pageKeys.put(rs.getString(1), rs.getString(2)); } wrapper.checkIn(); return pageKeys; } catch (Exception e) { wrapper.checkIn(); throw e; } } class ResultStore implements Comparable { public int id; public String title; public String name; public String page; public String pageid; public int score; public ResultStore(int i, String t, String n, String p, String pi, int s) { id = i; title = t; name = n; page = p; pageid = pi; score = s; } public int compareTo(Object o) { ResultStore s = (ResultStore) o; if (s.score == score) return 0; if (s.score > score) return -1; return 1; } } }