Last active
June 20, 2021 15:02
-
-
Save johnmiedema/4020deea875ce306971e to your computer and use it in GitHub Desktop.
Create an OpenNLP model for Named Entity Recognition of Book Titles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Create an OpenNLP model for Named Entity Recognition of Book Titles
//See tester at https://gist.github.com/johnmiedema/7e7330e1b9263267bdfc
package demoModelTrainer; | |
import java.io.File; | |
import java.io.FileOutputStream; | |
import java.util.Collections; | |
import opennlp.tools.namefind.NameFinderME; | |
import opennlp.tools.namefind.NameSampleDataStream; | |
import opennlp.tools.namefind.TokenNameFinderModel; | |
import opennlp.tools.util.PlainTextByLineStream; | |
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; | |
public class BuildModelDefaultFeatures { | |
public static void main(String[] args) { | |
//load trained data into memory | |
//titles marked up with <START> and <END> tags | |
//one sentence per line | |
File inFile = new File("titles.txt"); | |
//create NameSampleDataStream | |
//converts tagged strings from trained data into NameSample objects | |
//populated in next step | |
NameSampleDataStream nss = null; | |
try { | |
nss = new NameSampleDataStream( | |
new PlainTextByLineStream( | |
new java.io.FileReader(inFile))); | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
//create "title" model | |
TokenNameFinderModel model = null; | |
int iterations = 100; | |
int cutoff = 5; | |
try { | |
model = NameFinderME.train( | |
"en", //language of the training data (relevant to tokenization) | |
"title", //type of model | |
nss, //the NameSample collection, created above | |
(AdaptiveFeatureGenerator) null, //null=use default set of feature generators for NE detection | |
Collections.<String,Object>emptyMap(), //empty, not adding additional resources to the model | |
iterations, //number of iterations before the model outputs, not important | |
cutoff); //lower bound for the number of times a feature exists before it is included in the model | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
//save the model to disk | |
//used in testing and production | |
File outFile = null; | |
try { | |
outFile = new File("en-title.bin"); | |
FileOutputStream outFileStream = new FileOutputStream(outFile); | |
model.serialize(outFileStream); | |
} | |
catch (Exception ex) { | |
System.out.println(ex.getMessage()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, it shows errors like the ones below. Please guide.
nss = new NameSampleDataStream(new PlainTextByLineStream(new java.io.FileReader(inFile)));
The constructor PlainTextByLineStream(FileReader) is undefined
model = NameFinderME.train(
"en", //language of the training data (relevant to tokenization)
"title", //type of model
nss, //the NameSample collection, created above
(AdaptiveFeatureGenerator) null, //null=use default set of feature generators for NE detection
Collections.<String,Object>emptyMap(), //empty, not adding additional resources to the model
iterations, //number of iterations before the model outputs, not important
cutoff)
Multiple markers at this line
- The method train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory) in the type NameFinderME is not applicable
for the arguments (String, String, NameSampleDataStream, AdaptiveFeatureGenerator, Map<String,Object>, int, int)
- The method train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory) in the type NameFinderME is not applicable
for the arguments (String, String, NameSampleDataStream, AdaptiveFeatureGenerator, Map<String,Object>, int, int)