Last active
August 15, 2019 20:12
-
-
Save johnmiedema/10796966 to your computer and use it in GitHub Desktop.
Tokenize content using OpenNLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Tokenizing content using OpenNLP
// 'Whatson' blog series at johnmiedema.com
// http://johnmiedema.com/?tag=whatson

// Select the tokenizer model, in this case a pre-trained model from OpenNLP.
// Custom models can be built for unique whitespace handling requirements.
//
// The stream is opened inside try-with-resources so that:
//   - a missing or unreadable model file is handled by the catch block below
//     instead of escaping the snippet, and
//   - the stream is closed automatically on every path, with no finally block.
try (InputStream modelIn = new FileInputStream("en-token.bin")) {

    // Load the model from the stream and build a tokenizer from it.
    TokenizerModel model = new TokenizerModel(modelIn);
    Tokenizer tokenizer = new TokenizerME(model);

    // Tokenize the content.
    String[] tokens = tokenizer.tokenize("Dr. Lanyon sat alone over his wine. This was a hearty, healthy, dapper, red-faced gentleman, with a shock of hair prematurely white, and a boisterous and decided manner.");

    // Print one token per line; see the partial output below.
    for (String token : tokens) {
        System.out.println(token);
    }
}
catch (IOException e) {
    // Covers failure to open the model file, to read it, or to close the stream.
    e.printStackTrace();
}

// Partial output, pipes added:
// Dr. | Lanyon | sat | alone | over | his | wine | . | This | was | a | hearty | , | healthy …
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment