Created
February 13, 2018 17:50
-
-
Save Jawn78/632f87e28ae92bf5574011ab1fd4b340 to your computer and use it in GitHub Desktop.
This is a project that uses Apache Lucene and Apache Tika to extract document information and prepare it for entity recognition.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package rex1nlp; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import opennlp.tools.tokenize.TokenizerME; | |
import opennlp.tools.tokenize.TokenizerModel; | |
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.queryparser.flexible.standard.parser.ParseException; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.tika.exception.TikaException; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.parser.AutoDetectParser; | |
import org.apache.tika.parser.ParseContext; | |
import org.apache.tika.parser.Parser; | |
import org.apache.tika.sax.BodyContentHandler; | |
import org.xml.sax.ContentHandler; | |
import org.xml.sax.SAXException; | |
public class luceneRex { | |
public static void main(String[] args) throws IOException, ParseException, org.apache.lucene.queryparser.classic.ParseException, TikaException, SAXException { | |
InputStream inputStreamTokenizer = new | |
FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin"); | |
TokenizerModel tokenModel = new TokenizerModel(inputStreamTokenizer); | |
//Instantiating the TokenizerME class | |
TokenizerME tokenizer = new TokenizerME(tokenModel); | |
String target = "C:\\Users\\RexPC\\Documents\\Haily.docx"; | |
File document = new File(target); | |
Parser parser = new AutoDetectParser(); | |
ContentHandler handler = new BodyContentHandler(); | |
Metadata metadata = new Metadata(); | |
parser.parse(new FileInputStream(document), handler, metadata, new ParseContext()); | |
// 0. Specify the analyzer for tokenizing text. | |
// The same analyzer should be used for indexing and searching | |
StandardAnalyzer analyzer = new StandardAnalyzer(); | |
// 1. create the index | |
Directory index = new RAMDirectory(); | |
IndexWriterConfig config = new IndexWriterConfig(analyzer); | |
try (IndexWriter w = new IndexWriter(index, config)) { | |
addDoc(w, handler.toString(), "193398817"); | |
// System.out.println(handler.toString()); | |
} | |
// 2. query | |
String querystr = args.length > 0 ? args[0] : "Cigna"; | |
// the "title" arg specifies the default field to use | |
// when no field is explicitly specified in the query. | |
Query q = new QueryParser("title", analyzer).parse(querystr); | |
// 3. search | |
int hitsPerPage = 10; | |
try (IndexReader reader = DirectoryReader.open(index)) { | |
IndexSearcher searcher = new IndexSearcher(reader); | |
TopDocs docs = searcher.search(q, hitsPerPage); | |
ScoreDoc[] hits = docs.scoreDocs; | |
// 4. display results | |
System.out.println("Found " + hits.length + " hits."); | |
for (int i = 0; i<hits.length; ++i) { | |
int docId = hits[i].doc; | |
Document d = searcher.doc(docId); | |
System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title")); | |
} | |
// reader can only be closed when there | |
// is no need to access the documents any more. | |
} | |
} | |
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException { | |
Document doc = new Document(); | |
doc.add(new TextField("title", title, Field.Store.YES)); | |
// use a string field for isbn because we don't want it tokenized | |
doc.add(new StringField("isbn", isbn, Field.Store.YES)); | |
w.addDocument(doc); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a code snippet from a project I am working on to build my own resume parser. It uses Apache Tika to extract and tokenize a document input stream, which Lucene then indexes, re-displays, and prepares for entity recognition. This is an attempt to borrow some code and test the use of Lucene in place of other natural language processing tools such as OpenNLP.