import java.sql.*; import java.util.*; import java.io.*; public class CreateThesaurus { public static final String url = "[url omitted]"; public static final String username = "[username omitted]"; public static final String passwd = "[password omitted]"; public static final String database = "[database omitted]"; public static final String driver = "org.gjt.mm.mysql.Driver"; private Connection con; public void addRelationships(String file, int skip) { BufferedReader in; Statement stmt; try { in = new BufferedReader(new FileReader(file)); stmt = con.createStatement(); } catch (Exception e) { e.printStackTrace(System.err); return;} String line; for (int i = 0; i < skip; i++) { try { in.readLine(); } catch (Exception e) {} } while (true) { try { line = in.readLine(); } catch (Exception e) { e.printStackTrace(System.err); continue; } if (line == null) break; StringTokenizer tok = new StringTokenizer(line, ","); PorterStemmer s = new PorterStemmer(); String root = tok.nextToken(); if (root.indexOf(' ') >= 0) continue; String rootStem; try { rootStem = formatString(s.stem(root)); } catch (Exception e) { continue; } System.err.println("Indexing: " + rootStem); while (tok.hasMoreTokens()) { try { String child = tok.nextToken(); if (child.indexOf(' ') >= 0) continue; String childStem = formatString(child); int childID = 0; int rootID = 0; ResultSet rs = stmt.executeQuery ("SELECT id FROM Word WHERE word=" + rootStem); if (rs.next()) rootID = rs.getInt(1); else continue; rs = stmt.executeQuery ("SELECT id FROM Word WHERE word=" + childStem); if (rs.next()) childID = rs.getInt(1); else continue; rs = stmt.executeQuery ("SELECT * FROM Syn, Word w1, Word w2" + " WHERE w1.word="+ rootStem + " AND w2.word=" + childStem + " AND (Syn.root=w1.id and Syn.child=w2.id" + " OR Syn.root=w2.id and Syn.child=w1.id)"); if (!rs.next()) { System.out.println ("INSERT INTO Syn VALUES (" + rootID + ", " + childID + ")"); stmt.executeUpdate ("INSERT INTO Syn VALUES (" + rootID + ", " + childID + ")"); } } catch (Exception e) { e.printStackTrace(System.err); } } } try { in.close(); } catch (IOException e) {} } public void addWords(String file) { int sequence = 0; BufferedReader in; Statement stmt; try { in = new BufferedReader(new FileReader(file)); stmt = con.createStatement(); } catch (Exception e) { e.printStackTrace(System.err); return;} String line; while (true) { try { line = in.readLine(); if (line == null) break; int cidx = line.indexOf(','); if (cidx > 0) { String word = line.substring(0, cidx); if (word.indexOf(' ') < 0) { ResultSet rs = stmt.executeQuery ("SELECT * FROM Word WHERE word=" + formatString(PorterStemmer.stem(word))); if (!rs.next()) { String update = "INSERT INTO Word VALUES (" + sequence + ", " + formatString(PorterStemmer.stem(word)) + ")"; sequence++; stmt.executeUpdate(update); } else { //System.err.println(rs.getString(1) + " " + //rs.getString(2)); } } } if (sequence % 1000 == 1) System.err.println("Done: " + sequence); } catch (Exception e) { e.printStackTrace(System.err); } } try { in.close(); } catch (IOException e) {} } public CreateThesaurus() { try { Class.forName(driver); con = DriverManager.getConnection(url, username, passwd); } catch (Exception e) { e.printStackTrace(System.err); } } public String formatString(String s) { if (s == null) return "null"; StringBuffer buf = new StringBuffer(s); for (int i = 0; i < buf.length(); i++) { if (buf.charAt(i) == '\'') { buf.insert(i, '\''); i++; } } buf.insert(0, '\''); buf.append('\''); return buf.toString(); } public static void main(String[] arg) { if (arg.length < 2) { System.err.println("Usage: java CreateThesaurus [filename]"); } String file = arg[0]; int skip = Integer.parseInt(arg[1]); CreateThesaurus ct = new CreateThesaurus(); ct.addRelationships(file, skip); //ct.addWords(file); } }