@rajdeokumarsingh
Forked from a34729t/RunStanfordParser.java
Last active August 29, 2015 14:06
package foo;
// Stanford Parser (this gist uses the older LexicalizedParser(String) constructor API)
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.*;
// Standard Java libraries
import java.io.*;
import java.util.*;
public class RunStanfordParser {
/**
* Tokenize and parse each line of a text file, printing words, POS tags, stems and typed dependencies.
* @param args Arg1 - full path to the serialized Stanford Parser model (e.g. englishPCFG.ser.gz), Arg2 - file to parse
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// input: args[0] = serialized parser model, args[1] = text file to parse
String parserFileOrUrl=args[0];
String fileToParse=args[1];
LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl); // Create new parser
//lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
// Open the input file and tokenize its contents line by line
BufferedReader br = new BufferedReader(new FileReader(fileToParse));
StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage
PTBTokenizer tkzr; // tokenizer object
WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object
// Read File Line By Line
String strLine;
while ((strLine = br.readLine()) != null) {
System.out.println ("Tokenizing and Parsing: "+strLine); // print current line to console
// do all the standard java over-complication to use the stanford parser tokenizer
sr = new StringReader(strLine);
tkzr = PTBTokenizer.newPTBTokenizer(sr);
List toks = tkzr.tokenize();
System.out.println ("tokens: "+toks);
Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something
// Output Option 1: Printing out various data by accessing it programmatically
// Get words, stemmed words and POS tags
ArrayList<String> words = new ArrayList<String>();
ArrayList<String> stems = new ArrayList<String>();
ArrayList<String> tags = new ArrayList<String>();
// Get words and Tags
for (TaggedWord tw : parse.taggedYield()){
words.add(tw.word());
tags.add(tw.tag());
}
// Get stems
ls.visitTree(parse); // apply the stemmer to the tree
for (TaggedWord tw : parse.taggedYield()){
stems.add(tw.word());
}
// Get dependency tree
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
Collection tdl = gs.typedDependenciesCollapsed();
// And print!
System.out.println("words: "+words);
System.out.println("POStags: "+tags);
System.out.println("stemmedWordsAndTags: "+stems);
System.out.println("typedDependencies: "+tdl);
// Output Option 2: Printing out various data using TreePrint
// Various TreePrint options
// "penn", // constituency parse
// "oneline",
// rootLabelOnlyFormat,
// "words",
// "wordsAndTags", // unstemmed words and pos tags
// "dependencies", // unlabeled dependency parse
// "typedDependencies", // dependency parse
// "typedDependenciesCollapsed",
// "latexTree",
// "collocations",
// "semanticGraph"
// Print using TreePrint with various options
//TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
//tp.printTree(parse);
System.out.println(); // separate output lines
}
}
}
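// Minimal usage sketch (the jar name and input file below are assumptions, not part of the gist):
//   javac -cp stanford-parser.jar foo/RunStanfordParser.java
//   java  -cp stanford-parser.jar:. foo.RunStanfordParser englishPCFG.ser.gz input.txt
// For each line of input.txt this prints the tokens, words, POS tags, stems and typed dependencies.

// The second class in this gist, RunStanfordParser2, wraps the parser for GENIA corpus
// preprocessing (.dep and .morph output) and lives in its own source file.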
package foo;
//Standard Java libraries
import java.io.*;
import java.util.*;
import syntax.Protein;
import syntax.AnnotationType;
import syntax.Token;
import util.Pair;
import util.Util;
// Stanford Parser
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer; // tokenizer
import edu.stanford.nlp.process.DocumentPreprocessor; // sentence splitter
public class RunStanfordParser2 {
// Stuff related to stanford parser
DocumentPreprocessor splitter; // sentence splitter
PTBTokenizer tkzr; // tokenizer object
WordStemmer ls; // stemmer/lemmatizer object
LexicalizedParser lp; // parser object
StringReader sr;
/**
* Construct a wrapper around Stanford Parser for creating .dep and .morph files
* @param parserFileOrUrl path to serialized Stanford Parser
*/
public RunStanfordParser2(String parserFileOrUrl){
splitter = new DocumentPreprocessor();
ls = new WordStemmer();
lp = new LexicalizedParser(parserFileOrUrl);
//lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
}
/**
* Return a filename without the extension
* @param fullpath path to a file
* @return the base filename with its extension removed
*/
private static String getFileNameFromPath(String fullpath){
String filename = new File(fullpath).getName();
return filename.substring(0,filename.lastIndexOf('.'));
}
/**
* Does a file with the expected suffix exist? If not throw an exception.
* @param path
* @param filename
* @param suffix
* @throws Exception
*/
public void existsFileWithSuffix(String path, String filename, String suffix) throws Exception{
File fSS = new File(path+filename+suffix);
if( !fSS.exists() )
throw new Exception("\n"+fSS.toString()+"\n"+"missing preprocessed file with suffix -"+suffix);
}
public void processGENIAfiles(String inputPath, String outputPath) throws Exception {
// Create the output directory if it doesn't exist
if(! (new File(outputPath).exists()) ){ new File(outputPath).mkdirs(); }
// Append a slash to the paths for ease of file reading/creation
if( !outputPath.endsWith("/") ){ outputPath = outputPath+"/"; }
if( !inputPath.endsWith("/") ){ inputPath = inputPath+"/"; }
// Look at files in the input path
File dir = new File(inputPath);
FileFilter fileFilter = new FileFilter() {
public boolean accept(File file) {
return file.isFile() && file.getName().endsWith(".txt") ;
}
};
File[] files = dir.listFiles(fileFilter);
System.out.println("Processing "+files.length+" files...");
int count = 0;
int total = files.length;
for(File f:files){
String filename = getFileNameFromPath(f.toString());
System.out.println("File:"+filename);
// Check if all necessary files exist!
existsFileWithSuffix(inputPath, filename, ".txt");
existsFileWithSuffix(inputPath, filename, ".standoff");
existsFileWithSuffix(inputPath, filename, ".tagNew");
existsFileWithSuffix(inputPath, filename, ".a1");
String txtFile = inputPath+filename+".txt";
String standoffFile = inputPath+filename+".standoff";
String tagFile = inputPath+filename+".tagNew";
//String tagFile = inputPath+filename+".tag";
String a1File = inputPath+filename+".a1";
//Workflow
// 1. Load standoff file to get sentence offsets
// 2. Load the text file as a whole (for word offset calculation after the parsing step)
// 3. Run the parser loop- basically read each set of tokens in the tag file (each sentence) and do the processing there
HashMap<Integer, int[]> standoff = loadStandoffFile(standoffFile);
String fullText = Util.readFileAsString(txtFile);
parseTagFile(tagFile, standoff, fullText, outputPath, filename); // parser loop- see function for bulk of text processing
count++;
System.out.println("parsed file "+count+"/"+total);
}
}
/**
* HACK ALERT - GENIA standoff data is sometimes missing for the last sentence; this hack fixes it
* @param standoff
* @param index
* @param fullText
* @return
*/
private Pair<Integer, Integer> getStandOff(HashMap<Integer, int[]> standoff, int index, String fullText){
int[] standOffArr = standoff.get(index);
int start = 0;
int stop = 0;
try {
start = standOffArr[0];
stop = standOffArr[1];
}
catch(Exception e) {
start = standoff.get(index-1)[1]+1;
stop = fullText.length();
}
return new Pair(start,stop);
}
/**
* Parse the tag file produced by the GENIA tagger, do some processing and reconcile this with the A1 file
* Then print the .dep and .morph output files
* @param path path to the GENIA tag file (.tagNew) for the current document
* @param standoff sentence start/stop offsets for all sentences in the current document
* @param fullText full text of the current document
* @param outputPath path to the output directory
* @param filename filename without suffix for current document
* @throws Exception
*/
private void parseTagFile(String path, HashMap<Integer, int[]> standoff, String fullText, String outputPath, String filename) throws Exception{
int sentenceCounter = 0;
try { //
// Prepare the outputfiles
File fDep = new File(outputPath+filename+".dep");
File fMorph = new File(outputPath+filename+".morph");
fDep.createNewFile();
fMorph.createNewFile();
BufferedWriter depFile = new BufferedWriter(new FileWriter(fDep));
BufferedWriter morphFile = new BufferedWriter(new FileWriter(fMorph));
// All the stuff for the specific file
File f = new File(path);
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
int tokenIndex = 0;
ArrayList<Token> tokens = new ArrayList();
while ((line=reader.readLine()) != null) {
// NOTE: The input file is a series of token lines, with a blank line separating sentences.
// We consume tokens until we hit a blank line, and then process the sentence.
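// e.g. a hypothetical tagger line (tab-separated): Inhibition<TAB>Inhibition<TAB>NN<TAB>B-NP<TAB>O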
if(!line.isEmpty()){
String elements[] = line.split("\\t"); // text, stem, pos, chunkTag, neTag
String tokText = elements[0];
String tokStem = elements[1];
String tokPOS = elements[2];
String nounChunk = elements[3];
String neChunk = elements[4];
Token tok = new Token(tokenIndex,tokText,tokStem, tokPOS, nounChunk, neChunk);
tokens.add(tok);
tokenIndex++;
}
else { // empty line- parse the sentence whose tokens were collected
//System.out.println("sentence#="+sentenceCounter);
// Parse
Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = null;
try {
// Workflow:
// 1. Get sentence start and stop offsets
// 2. Parse sentence in file
// 3. Add GENIA tagging information to parsed tokens
// 4. Reconcile parsed sentence with all applicable entities in entity file
// 5. Print .dep and .morph files
// 6. In case of failed parse, print .morph file and empty entry in the .dep file
// Get standoff data
Pair<Integer,Integer> st = getStandOff(standoff, sentenceCounter, fullText);
int start = st.a;
int stop = st.b;
// Parse
pair = parseSentence(tokens,start,stop,fullText);
// Add GENIA info to parsed tokens- noun and named entity chunking
// NOTE: Currently tokens omitted from parse representation (prepositions, etc) are not added back in
HashMap<Integer, Token> parsedToks = pair.a;
addGENIAInfoToTokens(tokens,parsedToks);
// Print file out
printSentenceToDepAndMorph(pair,depFile,morphFile);
} catch(Exception e){
// When we cannot parse a sentence
// 1. Catch the exception and print an error message
System.err.println("Bad Parse on "+filename+".txt"+", sentence #"+(sentenceCounter+1));
// 2. Print the tokens out to the .morph file anyways
// 3. Make an empty entry in the .dep file
printSentenceToDepAndMorph(tokens,depFile,morphFile);
}
sentenceCounter++;
tokens = new ArrayList();
tokenIndex = 0;
System.out.print("."); // progress counter
}
}
System.out.println("."); // end of progress counter
// Close output files
depFile.close();
morphFile.close();
System.out.println("\t"+"created "+fDep.getPath());
System.out.println("\t"+"created "+fMorph.getPath());
}
catch(Exception e) {
System.err.println("Fatal Parse Error - skipping file "+filename+".txt"+", sentence #"+(sentenceCounter+1));
}
}
/**
* Add GENIA tagger info to parsed tokens- specifically noun and named entity chunking info.
* @param tokens
* @param wordMap
*/
private void addGENIAInfoToTokens(ArrayList<Token> tokens, HashMap<Integer, Token> wordMap) {
// NOTE: There are not the same number of objects in tokens as wordMap (parsing removes preps. and stuff from wordMap)
// thus: len(tokens) >= len(wordMap.values)
// (leftover from an earlier design that stored Object[] word arrays; not used below)
//HashMap<Integer, Object[]> wordMap = new HashMap();
//List<Object[]> words = new ArrayList(wordMap.values());
int j=0;
for(int i=0; i<tokens.size();i++){
Token tok = tokens.get(i);
j=i+1; // Dependency indices start at 1, not 0, so make the token indices match
if(wordMap.containsKey(j)){
Token parsedTok = wordMap.get(j);
parsedTok.chunkTag = tok.chunkTag;
parsedTok.neTag = tok.neTag;
}
// else {
// // We add non-parsed words to the wordMap because they may contain GENIA tagging info for noun or ne chunks
// // NOTE: offsets are not calculated in the parser stage for these words
// // NOTE: -1 offset means disregard!!!
// Object[] wordArr = {tok.partOfSpeech, tok.text, tok.stem, -1, -1, tok.chunkTag, tok.neTag};
// wordMap.put(j, wordArr);
// }
}
}
/**
* Run the Stanford Parser on one sentence (given as GENIA-tagged tokens)
* NOTE: The start/stop offsets are needed to calculate per-word character offsets
* @param tok tokens of the sentence to parse
* @param start index of the sentence start in fullText
* @param stop index of the sentence end in fullText
* @param fullText full document text the sentence is part of
* @throws Exception
*/
private Pair<HashMap<Integer, Token>, ArrayList<Object[]>> parseSentence(ArrayList<Token> tok, int start, int stop, String fullText) throws Exception{
//System.out.println("start,stop=<"+start+","+stop+">");
// Create parser input from genia-tagged input
ArrayList<Word> toksW = new ArrayList();
for(Token t:tok){
toksW.add( new Word(t.text) );
}
Tree parse = (Tree) lp.apply((List)toksW); // finally, we actually get to parse something
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
//Collection tdl = gs.typedDependenciesCollapsed(); //NOTE: Using Un-collapsed dependencies
Collection tdl = gs.typedDependencies();
// Data structures
HashMap<Integer,Token> wordMap = new HashMap<Integer, Token>(); // Holds values we build with .morph ( index -> { POS tag, word, stem, start offset, end offset } )
ArrayList<Object[]> relnList = new ArrayList<Object[]>(); // For .dep, holds arrays of the form { relation, head index, dependent index }
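// e.g. {"nsubj", 2, 1} records that token 1 is the nsubj dependent of (is governed by) token 2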
// We walk over the dependency parse and pull out the indices, then do the same for the stemmed parse
// Unstemmed parse
for( Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
TypedDependency var = iter.next();
TreeGraphNode dep = var.dep();
TreeGraphNode gov = var.gov();
// All useful information for a node in the tree
String reln = var.reln().getShortName();
int depIdx = var.dep().index();
int govIdx = var.gov().index();
Object relnArr[] = {reln, govIdx, depIdx};
relnList.add(relnArr);
Token depTok = new Token(depIdx, dep.label().tag(), dep.label().value());
Token govTok = new Token(govIdx, gov.label().tag(), gov.label().value());
wordMap.put(depIdx, depTok);
wordMap.put(govIdx, govTok);
}
// Stemmed parse
// Get stems!
ls.visitTree(parse); // apply the stemmer to the tree
gs = gsf.newGrammaticalStructure(parse);
tdl = gs.typedDependenciesCollapsed();
for( Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
TypedDependency var = iter.next();
TreeGraphNode dep = var.dep();
TreeGraphNode gov = var.gov();
int depIdx = dep.index();
if( wordMap.containsKey(depIdx))
wordMap.get( depIdx ).stem = dep.value();
int govIdx = gov.index();
if( wordMap.containsKey(govIdx))
wordMap.get( govIdx ).stem = gov.value();
}
calculateWordOffsets(wordMap, fullText, start, stop, toksW);
Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = new Pair(wordMap, relnList);
return pair;
}
private void printSentenceToDepAndMorph(Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair, BufferedWriter depFile, BufferedWriter morphFile) throws IOException{
HashMap<Integer, Token> tokens = pair.a;
ArrayList<Object[]> relnList = pair.b;
// Print .morph
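// .morph columns (tab-separated): index, POS, text, stem, startOffset, endOffset, chunkTag, neTag,
// isAcronym, isPartialAcronymMatch, isProteinAcronymMatch, acronym, acronymExpandedText, proteinAcronym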
List keys = new ArrayList(tokens.keySet());
Collections.sort(keys); // get tokens, in order
for(Object k:keys){
int key = Integer.parseInt(k.toString());
Token tok = tokens.get(key);
morphFile.write(tok.index+"\t");
morphFile.write(tok.pos+"\t");
morphFile.write(tok.text+"\t");
morphFile.write(tok.stem+"\t");
morphFile.write(tok.startOffset+"\t");
morphFile.write(tok.endOffset+"\t");
morphFile.write(tok.chunkTag+"\t");
morphFile.write(tok.neTag+"\t");
morphFile.write(tok.isAcronym+"\t");
morphFile.write(tok.isPartialAcronymMatch+"\t");
morphFile.write(tok.isProteinAcronymMatch+"\t");
morphFile.write(tok.acronym+"\t");
morphFile.write(tok.acronymExpandedText+"\t");
morphFile.write(tok.proteinAcronym+"\t");
morphFile.write("\n");
}
morphFile.newLine();
//print .dep
for(Object[] relnArr: relnList){
String output = relnArr[0]+"\t"+relnArr[1]+"\t"+relnArr[2];
depFile.write(output+"\n");
}
depFile.newLine();
}
private void printSentenceToDepAndMorph(ArrayList<Token> toks, BufferedWriter depFile, BufferedWriter morphFile) throws IOException{
for(Token tok:toks){
morphFile.write(tok.index+"\t");
morphFile.write(tok.pos+"\t");
morphFile.write(tok.text+"\t");
morphFile.write(tok.stem+"\t");
morphFile.write(tok.startOffset+"\t");
morphFile.write(tok.endOffset+"\t");
morphFile.write(tok.chunkTag+"\t");
morphFile.write(tok.neTag+"\t");
morphFile.write(tok.isAcronym+"\t");
morphFile.write(tok.isPartialAcronymMatch+"\t");
morphFile.write(tok.isProteinAcronymMatch+"\t");
morphFile.write(tok.acronym+"\t");
morphFile.write(tok.acronymExpandedText+"\t");
morphFile.write(tok.proteinAcronym+"\t");
morphFile.write("\n");
}
morphFile.newLine();
//print empty .dep
depFile.newLine();
}
/**
* Load GENIA sentence splitter standoff file (shows sentence boundaries)
* @param path
* @return hashmap<sentence_number, {start,stop}>
* @throws Exception
*/
private HashMap<Integer, int[]> loadStandoffFile(String path) throws Exception {
HashMap<Integer , int[]> standoff = new HashMap();
File f = new File(path);
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
int sentenceCounter = 0;
while ((line=reader.readLine()) != null) {
if( !line.isEmpty() ){
String elements[] = line.split("\\t"); // format: start offset <TAB> end offset
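// e.g. a hypothetical standoff line "0<TAB>141" gives the character offsets of the first sentence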
int[] startstop = { Integer.parseInt(elements[0]), Integer.parseInt(elements[1]) };
standoff.put(sentenceCounter, startstop);
sentenceCounter++;
}
}
return standoff;
}
/**
* Is this a whitespace character?
* @param c character
* @return true if c is a space, newline or carriage return
*/
private boolean IsWhiteSpace(char c){ return c == '\n' || c == ' ' || c == '\r'; }
/**
* Walk through the text and match each non-whitespace token in the tokenized sentence until completion.
* Some words (like prepositions and parens) aren't in the parse, so we need to make sure we can skip over them nicely and not break.
* NOTE: The whole thing is predicated on the fact that we are looping over the sentence, tokens and parse in order
* so the first word X or Y we see is the same in all 3 of these.
* @param wordMap Results of dependency parse (parseIndex-word mappings)
* @param text Block of text
* @param startIdx start of sentence
* @param stopIdx end of sentence
* @param sent0 Sentence tokens
* @throws Exception
*/
private void calculateWordOffsets(HashMap<Integer, Token> wordMap, String text, int startIdx, int stopIdx, List<Word> sent0) throws Exception {
// Parsed word data structure
Object[] keys = wordMap.keySet().toArray();
Arrays.sort(keys); // sort indices from smallest to largest
int wordArrIdx = 0;
// get into the correct position
int offset = startIdx;
int start = 0;
text = text.substring(startIdx, stopIdx);
for(Word word : sent0){
String w = word.toString();
String originalW = edu.stanford.nlp.process.PTBTokenizer.ptbToken2Text(w);
// first trim off any leading whitespace
while( IsWhiteSpace(text.charAt(0)) ){
text = text.substring(1);
offset++; //increment the offset counter
}
// now see if our word matches
start = offset;
char firstChar = originalW.charAt(0);
if( firstChar == text.charAt(0)){
// Is this the word in the token?
if( text.startsWith(originalW, 0) ){
int len = originalW.length();
offset+=len;
text = text.substring(len);
// Is this token a word in the parse?
Token tok = wordMap.get(keys[wordArrIdx]);
String wParse = tok.text;
wParse = wParse.replaceAll("\\\\/", "/"); // java regexes are awful: "\\\\" == "\"
//System.out.println("wordmap=/"+wParse+"/ vs hasword=/"+originalW+"/"+" wordArrIdx="+wordArrIdx);
if ( wParse.equals(originalW) ){
//System.out.println("\t"+originalW+" ("+start+","+offset+")");
tok.startOffset = start;
tok.endOffset = offset;
wordArrIdx++;
if(wordArrIdx >= keys.length)
break;
}
}
} else {
System.out.println("w:"+word+", originalW"+originalW);
System.out.println("firstChar:"+firstChar+", text.charAt(0)"+text.charAt(0));
throw new Exception("unknown token");
}
}
}
/**
* Turn Stanford Parser sentence splitter output into a string (for a single sentence)
* @param s sentence list
* @return sentence as string
*/
public static String join(List<String> s) {
if (s.isEmpty()) return "";
String delimiter = " ";
Iterator<String> iter = s.iterator();
StringBuffer buffer = new StringBuffer(iter.next().toString());
while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
return buffer.toString();
}
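/**
 * Minimal driver sketch (not part of the original gist; the argument order below is an assumption)
 * showing how this wrapper is presumably run end-to-end.
 * @param args args[0] = serialized parser model, args[1] = GENIA input directory, args[2] = output directory
 */
public static void main(String[] args) throws Exception {
    RunStanfordParser2 wrapper = new RunStanfordParser2(args[0]); // load the parser once
    wrapper.processGENIAfiles(args[1], args[2]);                  // write .dep and .morph files for each .txt
}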
}