Skip to content

Instantly share code, notes, and snippets.

@bradfordcp
Created September 2, 2010 19:12
Show Gist options
  • Star 30 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save bradfordcp/562776 to your computer and use it in GitHub Desktop.
Save bradfordcp/562776 to your computer and use it in GitHub Desktop.
Converts a WordNet prolog file into a flat file useful for Solr synonym matching.
/**
* Based off of the Lucene prolog parser in the wordnet contrib package within the
* main Lucene project. It has been modified to remove the Lucene bits and generate
* a synonyms.txt file suitable for consumption by Solr. The idea was mentioned in
* a sidebar of the book Solr 1.4 Enterprise Search Server by Eric Pugh.
*
* @see <a href="http://lucene.apache.org/java/2_3_2/lucene-sandbox/index.html#WordNet/Synonyms">Lucene Sandbox WordNet page</a>
* @see <a href="http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/contrib/wordnet/">SVN Repository of the WordNet contrib</a>
* @see <a href="https://www.packtpub.com/solr-1-4-enterprise-search-server/book">Solr 1.4 Enterprise Search Server Book</a>
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
* into a text file suitable for Solr synonym matching
*
* This has been tested with WordNet 3.0.
*
* <p>
* The source word is the first entry, followed by a comma separated list of synonyms
* </p>
* <p>
* While the WordNet file distinguishes groups of synonyms with
* related meanings we don't do that here.
* </p>
*
*
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
* @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
*/
public class Syns2Syms {

    /** Standard output, used for progress messages. */
    private static final PrintStream o = System.out;

    /** Standard error, used for error messages. */
    private static final PrintStream err = System.err;

    /**
     * Entry point. Expects exactly two arguments: the WordNet prolog file
     * (typically {@code wn_s.pl}) to read, and the flat synonyms file to write.
     *
     * @param args {@code {prologFilename, outputFilename}}
     * @throws Throwable if the prolog file cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 2) {
            usage();
            System.exit(1);
        }
        String prologFilename = args[0]; // name of file "wn_s.pl"
        String outputFilename = args[1];

        // ensure that the prolog file is readable
        if (!new File(prologFilename).canRead()) {
            err.println("Error: cannot read Prolog file: " + prologFilename);
            System.exit(1);
        }
        // canWrite() is false for a file that does not exist yet, so try to
        // create it before declaring the destination unwritable
        File outputFile = new File(outputFilename);
        if (!outputFile.canWrite() && !outputFile.createNewFile()) {
            err.println("Error: cannot write output file: " + outputFilename);
            System.exit(1);
        }

        // maps a word to all the synset "group" numbers it appears in
        final Map<String, List<String>> word2Nums = new TreeMap<String, List<String>>();
        // maps a synset group number to all the words in that group
        final Map<String, List<String>> num2Words = new TreeMap<String, List<String>>();
        int ndecent = 0; // number of rejected (non-alphabetic / multi-word) entries
        int mod = 1;     // progress is reported at exponentially growing intervals
        int row = 1;

        o.println("Opening Prolog file " + prologFilename);
        o.println("[1/2] Parsing " + prologFilename);
        // try-with-resources guarantees the reader (and its underlying stream)
        // is closed even when parsing fails; WordNet data is ASCII, so an
        // explicit UTF-8 decode is safe and locale-independent
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(prologFilename), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // occasional progress: periodically echo the line just read
                if ((++row) % mod == 0) {
                    mod *= 2;
                    o.println("\t" + row + " " + line + " " + word2Nums.size()
                            + " " + num2Words.size() + " ndecent=" + ndecent);
                }
                // every synset entry looks like: s(synset_id,w_num,'word',ss_type,...).
                if (!line.startsWith("s(")) {
                    err.println("OUCH: " + line);
                    System.exit(1);
                }
                // parse out the synset number and the quoted word
                line = line.substring(2);
                String num = line.substring(0, line.indexOf(','));
                line = line.substring(line.indexOf('\'') + 1);
                // a doubled '' inside the quotes is Prolog's escape for a single quote;
                // Locale.ROOT avoids locale-sensitive lowercasing (e.g. Turkish dotless i)
                String word = line.substring(0, line.lastIndexOf('\''))
                        .toLowerCase(Locale.ROOT).replace("''", "'");
                // reject anything that is not a single plain alphabetic word
                if (!isDecent(word)) {
                    ndecent++;
                    continue; // don't store words w/ spaces, digits, punctuation
                }
                addMapping(word2Nums, word, num);
                addMapping(num2Words, num, word);
            }
        }

        o.println("[2/2] Building index to store synonyms, " + " map sizes are "
                + word2Nums.size() + " and " + num2Words.size());
        index(outputFilename, word2Nums, num2Words);
    }

    /**
     * Appends {@code value} to the list stored under {@code key},
     * creating the list on first use.
     */
    private static void addMapping(Map<String, List<String>> map, String key, String value) {
        List<String> values = map.get(key);
        if (values == null) {
            values = new LinkedList<String>();
            map.put(key, values);
        }
        values.add(value);
    }

    /**
     * Checks whether a word consists only of alphabetic characters,
     * one character at a time.
     *
     * @param s string to check
     * @return <code>true</code> if the string is decent
     */
    private static boolean isDecent(String s) {
        int len = s.length();
        for (int i = 0; i < len; i++) {
            if (!Character.isLetter(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes the flat synonyms file based on the two maps: one line per word,
     * the word itself followed by a comma-separated list of its synonyms.
     * Words with no usable synonyms are omitted.
     *
     * @param outputFileName the file where the synonyms should be created
     * @param word2Nums map from word to the synset numbers it appears in
     * @param num2Words map from synset number to the words in that synset
     * @throws Throwable if writing fails
     */
    private static void index(String outputFileName, Map<String, List<String>> word2Nums,
            Map<String, List<String>> num2Words) throws Throwable {
        int row = 0;
        int mod = 1;
        o.println("Opening output file");
        // try-with-resources closes (and flushes) the writer on every exit path
        try (Writer out = new OutputStreamWriter(
                new FileOutputStream(outputFileName), StandardCharsets.UTF_8)) {
            for (String word : word2Nums.keySet()) {
                StringBuilder builder = new StringBuilder(word);
                int n = index(word2Nums, num2Words, word, builder);
                if (n > 0) { // else degenerate: no synonyms survived filtering
                    if ((++row % mod) == 0) {
                        o.println("\trow=" + row + "/" + word2Nums.size()
                                + " builder= " + builder);
                        mod *= 2;
                    }
                    builder.append("\n");
                    out.write(builder.toString());
                }
            }
        }
    }

    /**
     * Given the two maps, appends the sorted, de-duplicated synonyms of one
     * word {@code g} to {@code builder} as {@code ", syn"} fragments.
     *
     * @return the number of synonyms appended
     */
    private static int index(Map<String, List<String>> word2Nums,
            Map<String, List<String>> num2Words, String g, StringBuilder builder)
            throws Throwable {
        // TreeSet keeps the synonyms sorted and collapses duplicates across synsets
        Set<String> already = new TreeSet<String>();
        for (String num : word2Nums.get(g)) {
            already.addAll(num2Words.get(num));
        }
        already.remove(g); // of course a word is its own synonym
        int num = 0;
        for (String cur : already) {
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!isDecent(cur)) {
                continue;
            }
            num++;
            builder.append(", ").append(cur);
        }
        return num;
    }

    /**
     * Prints a usage message for the command line.
     */
    private static void usage() {
        o.println("\n\n" + "Generates the appropriate synonyms in a format for Apache Solr\nUsage: java Syns2Syms <prolog file> <output file>\nExample: java Syns2Syms prologwn/wn_s.pl synonyms.txt\n");
    }
}
@shasso
Copy link

shasso commented Oct 22, 2021

I agree with the last comment, and Solr now supports multi-word synonym mapping. A simple fix in the isDecent() function forces these multi-word entries to be emitted rather than filtered out (which is essentially what this function is doing). In my case I modified it slightly to see what those multi-word entries are (there are many) and always return true so they won't be filtered out by the calling function. However, you have to deal, in Solr, with several issues, as these examples from WordNet illustrate:
'tween, between (apostrophe)
.22 caliber, .22 calibre (period is part of the word and can't be filtered out)
with_mercy (underscore needs to be converted to space--'with mercy' is a phrase)
bird-scarer (hyphenated words need to be preserved)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment