-
-
Save johnmiedema/4020deea875ce306971e to your computer and use it in GitHub Desktop.
//Create an OpenNLP model for Named Entity Recognition of Book Titles
//See tester at https://gist.github.com/johnmiedema/7e7330e1b9263267bdfc
package demoModelTrainer; | |
import java.io.File; | |
import java.io.FileOutputStream; | |
import java.util.Collections; | |
import opennlp.tools.namefind.NameFinderME; | |
import opennlp.tools.namefind.NameSampleDataStream; | |
import opennlp.tools.namefind.TokenNameFinderModel; | |
import opennlp.tools.util.PlainTextByLineStream; | |
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; | |
public class BuildModelDefaultFeatures { | |
public static void main(String[] args) { | |
//load trained data into memory | |
//titles marked up with <START> and <END> tags | |
//one sentence per line | |
File inFile = new File("titles.txt"); | |
//create NameSampleDataStream | |
//converts tagged strings from trained data into NameSample objects | |
//populated in next step | |
NameSampleDataStream nss = null; | |
try { | |
nss = new NameSampleDataStream( | |
new PlainTextByLineStream( | |
new java.io.FileReader(inFile))); | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
//create "title" model | |
TokenNameFinderModel model = null; | |
int iterations = 100; | |
int cutoff = 5; | |
try { | |
model = NameFinderME.train( | |
"en", //language of the training data (relevant to tokenization) | |
"title", //type of model | |
nss, //the NameSample collection, created above | |
(AdaptiveFeatureGenerator) null, //null=use default set of feature generators for NE detection | |
Collections.<String,Object>emptyMap(), //empty, not adding additional resources to the model | |
iterations, //number of iterations before the model outputs, not important | |
cutoff); //lower bound for the number of times a feature exists before it is included in the model | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
//save the model to disk | |
//used in testing and production | |
File outFile = null; | |
try { | |
outFile = new File("en-title.bin"); | |
FileOutputStream outFileStream = new FileOutputStream(outFile); | |
model.serialize(outFileStream); | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
} | |
} |
Can you explain how to write the model file?
Hi, it shows the errors below. Please guide.
nss = new NameSampleDataStream(new PlainTextByLineStream(new java.io.FileReader(inFile)));
The constructor PlainTextByLineStream(FileReader) is undefined
model = NameFinderME.train(
"en", //language of the training data (relevant to tokenization)
"title", //type of model
nss, //the NameSample collection, created above
(AdaptiveFeatureGenerator) null, //null=use default set of feature generators for NE detection
Collections.<String,Object>emptyMap(), //empty, not adding additional resources to the model
iterations, //number of iterations before the model outputs, not important
cutoff)
Multiple markers at this line
- The method train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory) in the type NameFinderME is not applicable
for the arguments (String, String, NameSampleDataStream, AdaptiveFeatureGenerator, Map<String,Object>, int, int)
- The method train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory) in the type NameFinderME is not applicable
for the arguments (String, String, NameSampleDataStream, AdaptiveFeatureGenerator, Map<String,Object>, int, int)
Hi, I am trying to reproduce your PoC. I created a model file. Below is my sample model file.
Ajaya
Hungry Tide
Nagas .
But when I test this, the names returned by TokenNameFinder include all the tokens, and it is not retrieving the specific titles I specified in the sample training file. Can you suggest anything around this?