Skip to content

Instantly share code, notes, and snippets.

@javajosh
Created August 17, 2011 03:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save javajosh/1150723 to your computer and use it in GitHub Desktop.
Save javajosh/1150723 to your computer and use it in GitHub Desktop.
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import com.csvreader.CsvReader;
public class Main {
static final boolean DEBUG = true;
/**
* Find out if the activity log note field mentions any known procedure.
*
* TODO: Doesn't work! There are far too many false positives! I suspect a problem with query construction.
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
long start = System.currentTimeMillis();
List<ActivityRecord> activityLog = parseActivityLog("Activity.csv");
List<String> procedureNames = parseProcedureNames("Procedures.xml");
// Now find the overlap with Lucene. We will write our index into
// memory.
Directory directory = DEBUG ? new NIOFSDirectory(new File(
"lucene_index")) : new RAMDirectory(); // allows us to use Luke
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_33);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33,
analyzer);
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(directory, iwc);
// Index the log notes
for (ActivityRecord activity : activityLog) {
if (activity.note.trim().isEmpty())
continue;
Document doc = new Document();
doc.add(new Field("start", activity.start, Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(new Field("note", activity.note, Field.Store.YES,
Field.Index.ANALYZED));
// System.out.println(activity);
writer.addDocument(doc);
}
writer.close();
// Do one search per procedure name on the log notes index
IndexSearcher searcher = new IndexSearcher(directory);
QueryParser queryParser = new QueryParser(Version.LUCENE_33, "note",
analyzer);
for (String procedureName : procedureNames) {
try {
// This may actually be wrong, as strange characters may appear.
Query query = queryParser.parse(procedureName); // may throw
// parse
// exception
TopDocs rs = searcher.search(query, null, 10);
if (rs.totalHits > 0) {
Document firstHit = searcher.doc(rs.scoreDocs[0].doc);
System.out.printf("start: %s hits: %s proc: %s note: %s\n",
firstHit.getFieldable("start").stringValue(),
rs.totalHits, procedureName,
firstHit.getFieldable("note").stringValue());
}
} catch (ParseException e) {
//
}
}
System.out.printf("Duration: %sms", System.currentTimeMillis() - start);
}
}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment