Skip to content

Instantly share code, notes, and snippets.

@sergey-tihon
Last active May 15, 2019 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergey-tihon/41d122e67ca74384f02a3aa0456ed365 to your computer and use it in GitHub Desktop.
Save sergey-tihon/41d122e67ca74384f02a3aa0456ed365 to your computer and use it in GitHub Desktop.
A sample of training a custom NER model using OpenNLP.NET (https://github.com/sergey-tihon/OpenNLP.NET)
#load "common.fsx"
open java.nio.charset
open java.io
#I "../packages/OpenNLP.NET/lib/"
#r "opennlp-tools-1.8.4.dll"
#r "opennlp-uima-1.8.4.dll"
open opennlp.tools.util
open opennlp.tools.namefind
// The training data should contain at least 15000 sentences to create a model which performs well
/// Trains a name-finder model from the annotated sentences stored in `inputFile`.
///
/// The file is read line by line as UTF-8 and each line is parsed as a name
/// sample. Training is run for language code "en" and entity type "person"
/// with a `TrainingParameters` instance that has no explicit settings.
/// Returns whatever `NameFinderME.train` produces.
let train (inputFile:string) =
    // Every call to createInputStream opens a new FileInputStream on inputFile.
    let inputFactory =
        { new InputStreamFactory with
            member __.createInputStream () =
                let fileStream = new FileInputStream(inputFile)
                fileStream :> InputStream }
    // Only the outer sample stream is bound with `use`; it wraps the line stream.
    use samples =
        new NameSampleDataStream(
            new PlainTextByLineStream(inputFactory, StandardCharsets.UTF_8))
    let finderFactory = TokenNameFinderFactory()
    let parameters = TrainingParameters()
    // Optional tuning knobs, left disabled:
    //parameters.put(TrainingParameters.ITERATIONS_PARAM, "5")
    //parameters.put(TrainingParameters.CUTOFF_PARAM, "200")
    NameFinderME.train("en", "person", samples, parameters, finderFactory)
/// Writes `model` to the file at `outputFile`.
///
/// The model is serialized through a buffered stream layered over a
/// `FileOutputStream`; the buffered stream is disposed when the function exits.
let save (outputFile:string) (model:TokenNameFinderModel) =
    let fileOut = new FileOutputStream(outputFile)
    use bufferedOut = new BufferedOutputStream(fileOut)
    model.serialize(bufferedOut)
/// Builds a `TokenNameFinderModel` from the file at `inputFile`.
///
/// The file stream is disposed once the model has been constructed.
let load (inputFile:string) =
    use source = new FileInputStream(inputFile)
    let loaded = TokenNameFinderModel(source)
    loaded
// Example of annotated training data (OpenNLP test resource):
// http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt?revision=1245855&view=markup

// Train on the `my-model.train` file, write the result to `my-model.bin`,
// then read that file back so `model` is the round-tripped copy.
let newModel = train Common.Data.openNLP.``my-model.train``
save Common.Data.openNLP.``my-model.bin`` newModel
let model = load Common.Data.openNLP.``my-model.bin``
open opennlp.tools.tokenize
/// Tokenizer created from the `en-token.bin` model file referenced through
/// `Common.PaketFiles`. The file stream is disposed after the tokenizer is built.
let tokenizer =
    use modelIn =
        new FileInputStream(Common.PaketFiles.``opennlp.sourceforge.net``.``en-token.bin``)
    TokenizerME(TokenizerModel(modelIn))
// Split a sample sentence into tokens with the tokenizer defined above.
let sentence = tokenizer.tokenize("Hi Sergey Tihon, it's your NER model.")
// Wrap the reloaded model in a name finder and run it over the tokens.
let nameFinder = NameFinderME(model)
let spans = nameFinder.find(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment