Opennlp-servlet
package com.example.opennlp;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.TreeSet;
import javax.servlet.*;
import javax.servlet.http.*;
import com.google.gson.Gson;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.namefind.DictionaryNameFinder;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.TokenizerME;
/**
 * A servlet interface to the OpenNLP maxent classifier.
 *
 * @author Joyce Chan 2011
 **/
public class OpennlpServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

    private TokenNameFinder f_nameFinder;
    private TokenNameFinder r_nameFinder;
    private TokenizerME tokenizer;
    private TokenizerME tokenizer_internal_use;
    private Dictionary r_dictionary;
    private TreeSet<String> stopwords = new TreeSet<String>();
    private final String[] ENGLISH_STOP_WORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };
    private TokenizerME getTokenizer(String tokenizer_title) throws ServletException {
        // look up the tokenizer title, then the model location that title points to
        String title = getServletConfig().getInitParameter(tokenizer_title);
        if (title == null || title.trim().equals("")) throw new ServletException("Default tokenizer not given.");
        String location = getServletConfig().getInitParameter(title);
        if (location == null || location.trim().equals("")) throw new ServletException("Tokenizer location not given.");

        InputStream stream = getServletConfig().getServletContext().getResourceAsStream(location);
        if (stream == null) throw new ServletException("File not found. Filename = " + location);

        TokenizerME tokenizer = null;
        try {
            tokenizer = new TokenizerME(new TokenizerModel(new BufferedInputStream(stream)));
        } catch (IOException e) {
            throw new ServletException("IO problem reading tokenizer: " + location, e);
        } finally {
            try { stream.close(); } catch (IOException e) { }
        }
        return tokenizer;
    } // end getTokenizer
    // for NER
    private TokenNameFinder getNameFinder(String classifier_title) throws ServletException {
        // look up the classifier title, then the model location that title points to
        String title = getServletConfig().getInitParameter(classifier_title);
        if (title == null || title.trim().equals("")) throw new ServletException("Default classifier not given.");
        String location = getServletConfig().getInitParameter(title);
        if (location == null || location.trim().equals("")) throw new ServletException("Classifier location not given.");

        InputStream filestream = getServletConfig().getServletContext().getResourceAsStream(location);
        if (filestream == null) throw new ServletException("File not found. Filename = " + location);

        TokenNameFinder nmFinder = null;
        try {
            nmFinder = new NameFinderME(new TokenNameFinderModel(new BufferedInputStream(filestream)));
        } catch (IOException e) {
            throw new ServletException("IO problem reading classifier: " + location, e);
        } finally {
            try { filestream.close(); } catch (IOException e) { }
        }
        return nmFinder;
    } // end getNameFinder
    private DictionaryNameFinder getDictionaryNameFinder(String filetitle) throws ServletException {
        r_dictionary = new Dictionary();
        // look up the file title, then the raw-text location that title points to
        String title = getServletConfig().getInitParameter(filetitle);
        if (title == null || title.trim().equals("")) throw new ServletException("Default dictionary file not given.");
        String location = getServletConfig().getInitParameter(title);
        if (location == null || location.trim().equals("")) throw new ServletException("Dictionary file location not given.");

        InputStream filestream = getServletConfig().getServletContext().getResourceAsStream(location);
        if (filestream == null) throw new ServletException("File not found. Filename = " + location);

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(filestream));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                // lower-case, strip commas, tokenize, remove stop words
                String[] s = removeStopWordsFromSentence(tokenizer.tokenize(strLine.toLowerCase().replace(",", "")));
                // add both the full token sequence and each individual token to the dictionary
                putInDict(r_dictionary, s);
                for (int i = 0; i < s.length; i++) {
                    putInDict(r_dictionary, s[i]);
                }
            }
        } catch (Exception e) { // catch exception if any
            System.err.println("Error: " + e.getMessage());
        } finally {
            try { filestream.close(); } catch (IOException e) { }
        }
        return new DictionaryNameFinder(r_dictionary);
    } // end getDictionaryNameFinder
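
    // For illustration only: the raw text file behind the dictionary finder is assumed
    // to be plain text with one entry per line, e.g. "Acme Widgets, Inc." (the entry is
    // hypothetical, not from the gist). As the loop above shows, each line is lower-cased,
    // commas are stripped, the line is tokenized, stop words are removed, and both the
    // full token sequence and every individual token are added to the dictionary.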
    // Two name finders are configured here:
    //   - a maxent-trained classifier
    //   - an untrained finder that classifies from dictionary words only
    // Both use the same tokenizer model, which can be downloaded from the OpenNLP
    // project page (the English one is fine). A sketch of the web.xml wiring these
    // init-params expect follows init() below.
    public void init() throws ServletException {
        tokenizer = getTokenizer("default-tokenizer");
        tokenizer_internal_use = getTokenizer("default-tokenizer");
        // a maxent-trained classifier
        f_nameFinder = getNameFinder("f-classifier");
        // not trained; just a text file used to build the dictionary
        r_nameFinder = getDictionaryNameFinder("r-raw-text");
        addToStopWords();
    }
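
    // For illustration only: the init-params read in init() might be declared in web.xml
    // roughly as below. The indirection (a title param whose value names a second param
    // holding the model path) mirrors the getInitParameter calls above; the concrete
    // param values and file paths here are assumptions, not part of the original gist.
    //
    //   <servlet>
    //     <servlet-name>opennlp</servlet-name>
    //     <servlet-class>com.example.opennlp.OpennlpServlet</servlet-class>
    //     <init-param><param-name>default-tokenizer</param-name><param-value>en-token</param-value></init-param>
    //     <init-param><param-name>en-token</param-name><param-value>/WEB-INF/models/en-token.bin</param-value></init-param>
    //     <init-param><param-name>f-classifier</param-name><param-value>f-model</param-value></init-param>
    //     <init-param><param-name>f-model</param-name><param-value>/WEB-INF/models/f-classifier.bin</param-value></init-param>
    //     <init-param><param-name>r-raw-text</param-name><param-value>r-dict</param-value></init-param>
    //     <init-param><param-name>r-dict</param-name><param-value>/WEB-INF/data/r-dictionary.txt</param-value></init-param>
    //   </servlet>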
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doPost(request, response);
    }
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        @SuppressWarnings("unchecked")
        Map<String, String[]> reqMap = request.getParameterMap();
        PrintWriter pw = response.getWriter();
        String[] names = {};
        if (reqMap.containsKey("sentence")) {
            String sentence = request.getParameter("sentence");
            // lower-case, strip commas, tokenize, remove stop words
            String[] tokens = removeStopWordsFromSentence(tokenizer.tokenize(sentence.toLowerCase().replace(",", "")));
            Span[] spannames;
            if (reqMap.containsKey("dict") && request.getParameter("dict").equals("on")) {
                spannames = r_nameFinder.find(tokens);
            } else {
                spannames = f_nameFinder.find(tokens);
            }
            names = Span.spansToStrings(spannames, tokens);
            if (reqMap.containsKey("wt") && request.getParameter("wt").equals("json")) {
                printAsJson(response, pw, names);
            } else {
                printAsHtml(response, pw, names);
            }
        }
    } // end doPost
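
    // Example request, for illustration only (the /opennlp mapping is a hypothetical
    // servlet-mapping, not part of the gist):
    //   GET /opennlp?sentence=john+smith+works+at+acme&wt=json
    // tokenizes the sentence, drops stop words, runs the maxent name finder, and
    // returns the matched spans as a JSON array such as ["john smith"]. Adding
    // dict=on routes the lookup through the dictionary-backed finder instead, and
    // omitting wt=json returns the same names as an HTML bulleted list.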
    // private methods
    private void addToStopWords() {
        stopwords.addAll(Arrays.asList(ENGLISH_STOP_WORDS));
    }

    private String[] removeStopWordsFromSentence(String[] tokens) {
        ArrayList<String> newTokens = new ArrayList<String>(Arrays.asList(tokens));
        // removeAll avoids the index-shifting problem of removing while iterating
        newTokens.removeAll(stopwords);
        return newTokens.toArray(new String[newTokens.size()]);
    }
    private Dictionary putInDict(Dictionary d, String s) {
        d.put(new StringList(new String[] { s }));
        return d;
    }

    private Dictionary putInDict(Dictionary d, String[] s) {
        d.put(new StringList(s));
        return d;
    }
    // prints the result as JSON
    private void printAsJson(HttpServletResponse response, PrintWriter pw, String[] names) {
        Gson gson = new Gson();
        response.setContentType("application/json");
        pw.println(gson.toJson(names));
    }
    // prints the result as an HTML bulleted list
    private void printAsHtml(HttpServletResponse response, PrintWriter pw, String[] names) {
        response.setContentType("text/html");
        pw.println("<html>");
        pw.println("<head><title>Named entities</title></head>");
        pw.println("<body>");
        pw.println("<h1>Here are your named entities</h1>");
        pw.println("<ul>");
        for (int i = 0; i < names.length; i++) {
            pw.println("<li>" + names[i] + "</li>");
        }
        pw.println("</ul>");
        pw.println("</body></html>");
    }
}