
@a34729t
Created April 30, 2012 21:15
Programmatic interaction with the Stanford Parser - using built-in tokenization and using an external tokenizer
package foo;
import edu.stanford.nlp.fsm.ExactGrammarCompactor;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.ViterbiParser;
import edu.stanford.nlp.parser.KBestViterbiParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Numberer;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.ScoredObject;
import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPOutputStream;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;
public class RunStanfordParser {
/**
*
* @param args args[0] - full path to the serialized Stanford Parser grammar (englishPCFG.ser.gz), args[1] - file to parse
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// input: path to the serialized grammar, and the file to parse
String parserFileOrUrl=args[0];
String fileToParse=args[1];
LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl); // Create new parser
//lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
// Call parser on files, and tokenize the contents
FileInputStream fstream = new FileInputStream(fileToParse);
DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream
BufferedReader br = new BufferedReader(new InputStreamReader(in));
StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage
PTBTokenizer tkzr; // tokenizer object
WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object
// Read File Line By Line
String strLine;
while ((strLine = br.readLine()) != null) {
System.out.println ("Tokenizing and Parsing: "+strLine); // print current line to console
// do all the standard java over-complication to use the stanford parser tokenizer
sr = new StringReader(strLine);
tkzr = PTBTokenizer.newPTBTokenizer(sr);
List toks = tkzr.tokenize();
System.out.println ("tokens: "+toks);
Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something
// Output Option 1: Printing out various data by accessing it programmatically
// Get words, stemmed words and POS tags
ArrayList<String> words = new ArrayList<String>();
ArrayList<String> stems = new ArrayList<String>();
ArrayList<String> tags = new ArrayList<String>();
// Get words and Tags
for (TaggedWord tw : parse.taggedYield()){
words.add(tw.word());
tags.add(tw.tag());
}
// Get stems
ls.visitTree(parse); // apply the stemmer to the tree
for (TaggedWord tw : parse.taggedYield()){
stems.add(tw.word());
}
// Get dependency tree
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
Collection tdl = gs.typedDependenciesCollapsed();
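// Illustration (not printed by this code): each typed dependency has the form reln(governor-index, dependent-index),
// e.g. det(fox-4, The-1), amod(fox-4, quick-2), nsubj(jumped-5, fox-4) for "The quick brown fox jumped ..."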
// And print!
System.out.println("words: "+words);
System.out.println("POStags: "+tags);
System.out.println("stemmedWordsAndTags: "+stems);
System.out.println("typedDependencies: "+tdl);
// Output Option 2: Printing out various data using TreePrint
// Various TreePrint options
// "penn", // constituency parse
// "oneline",
// rootLabelOnlyFormat,
// "words",
// "wordsAndTags", // unstemmed words and pos tags
// "dependencies", // unlabeled dependency parse
// "typedDependencies", // dependency parse
// "typedDependenciesCollapsed",
// "latexTree",
// "collocations",
// "semanticGraph"
// Print using TreePrint with various options
//TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
//tp.printTree(parse);
System.out.println(); // separate output lines
}
}
}
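For comparison only (not part of the original gist): with a newer, 3.x-era Stanford Parser release the same per-sentence loop can be written more compactly, letting DocumentPreprocessor do the sentence splitting and tokenization and using the static LexicalizedParser.loadModel factory instead of the one-argument constructor. A minimal sketch, assuming that newer API; the class name is illustrative:
package foo;
import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.*;
// Hypothetical sketch: parse raw text with the 3.x-era API (iterable DocumentPreprocessor, loadModel factory).
public class RunStanfordParserNewApi {
    public static void main(String[] args) throws Exception {
        LexicalizedParser lp = LexicalizedParser.loadModel(args[0]); // e.g. englishPCFG.ser.gz
        String text = "The quick brown fox jumped over the lazy dog. It barked.";
        for (List<HasWord> sentence : new DocumentPreprocessor(new StringReader(text))) {
            Tree parse = lp.apply(sentence); // constituency parse of one sentence
            TreebankLanguagePack tlp = new PennTreebankLanguagePack();
            GrammaticalStructure gs = tlp.grammaticalStructureFactory().newGrammaticalStructure(parse);
            System.out.println(gs.typedDependenciesCollapsed()); // collapsed typed dependencies
        }
    }
}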
package foo;
//Standard Java libraries
import java.io.*;
import java.util.*;
import syntax.Protein;
import syntax.AnnotationType;
import syntax.Token;
import util.Pair;
import util.Util;
// Stanford Parser
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer; // tokenizer
import edu.stanford.nlp.process.DocumentPreprocessor; // sentence splitter
public class RunStanfordParser2 {
// Stuff related to stanford parser
DocumentPreprocessor splitter; // sentence splitter
PTBTokenizer tkzr; // tokenizer object
WordStemmer ls; // stemmer/lemmatizer object
LexicalizedParser lp; // parser object
StringReader sr;
/**
* Construct a wrapper around Stanford Parser for creating .dep and .morph files
* @param parserFileOrUrl path to serialized Stanford Parser
*/
public RunStanfordParser2(String parserFileOrUrl){
splitter = new DocumentPreprocessor();
ls = new WordStemmer();
lp = new LexicalizedParser(parserFileOrUrl);
//lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
}
/**
* Return a filename without the extension
* @param fullpath path to a file
* @return
*/
private static String getFileNameFromPath(String fullpath){
String filename = new File(fullpath).getName();
return filename.substring(0,filename.lastIndexOf('.'));
}
/**
* Does a file with the expected suffix exist? If not throw an exception.
* @param path
* @param filename
* @param suffix
* @throws Exception
*/
public void existsFileWithSuffix(String path, String filename, String suffix) throws Exception{
File fSS = new File(path+filename+suffix);
if( !fSS.exists() )
throw new Exception("\n"+fSS.toString()+"\n"+"missing preprocessed file with suffix -"+suffix);
}
public void processGENIAfiles(String inputPath, String outputPath) throws Exception {
// Create the output directory if it doesn't exist
if(! (new File(outputPath).exists()) ){ new File(outputPath).mkdirs(); }
// Append a slash to the paths for ease of file reading/creation
if( !outputPath.endsWith("/") ){ outputPath = outputPath+"/"; }
if( !inputPath.endsWith("/") ){ inputPath = inputPath+"/"; }
// Look at files in the input path
File dir = new File(inputPath);
FileFilter fileFilter = new FileFilter() {
public boolean accept(File file) {
return file.isFile() && file.getName().endsWith(".txt") ;
}
};
File[] files = dir.listFiles(fileFilter);
System.out.println("Processing "+files.length+" files...");
int count = 0;
int total = files.length;
for(File f:files){
String filename = getFileNameFromPath(f.toString());
System.out.println("File:"+filename);
// Check if all necessary files exist!
existsFileWithSuffix(inputPath, filename, ".txt");
existsFileWithSuffix(inputPath, filename, ".standoff");
existsFileWithSuffix(inputPath, filename, ".tagNew");
existsFileWithSuffix(inputPath, filename, ".a1");
String txtFile = inputPath+filename+".txt";
String standoffFile = inputPath+filename+".standoff";
String tagFile = inputPath+filename+".tagNew";
//String tagFile = inputPath+filename+".tag";
String a1File = inputPath+filename+".a1";
//Workflow
// 1. Load standoff file to get sentence offsets
// 2. Load the text file as a whole (for word offset calculation after the parsing step)
// 3. Run the parser loop- basically read each set of tokens in the tag file (each sentence) and do the processing there
HashMap<Integer, int[]> standoff = loadStandoffFile(standoffFile);
String fullText = Util.readFileAsString(txtFile);
parseTagFile(tagFile, standoff, fullText, outputPath, filename); // parser loop- see function for bulk of text processing
count++;
System.out.println("parsed file "+count+"/"+total);
}
}
/**
* HACK ALERT- GENIA standoff data is sometimes missing for last sentence, this hack fixes it
* @param standoff
* @param index
* @param fullText
* @return
*/
private Pair<Integer, Integer> getStandOff(HashMap<Integer, int[]> standoff, int index, String fullText){
int[] standOffArr = standoff.get(index);
int start = 0;
int stop = 0;
try {
start = standOffArr[0];
stop = standOffArr[1];
}
catch(Exception e) {
start = standoff.get(index-1)[1]+1;
stop = fullText.length();
}
return new Pair(start,stop);
}
/**
* Parse the tag file produced by the GENIA tagger, do some processing and reconcile this with the A1 file
* Then print the .dep and .morph output files
* @param path path to the GENIA .tag file for the current document
* @param standoff sentence standoff offsets for the current document
* @param fullText full text of current document
* @param outputPath path to output files
* @param filename filename without suffix for current document
* @throws Exception
*/
private void parseTagFile(String path, HashMap<Integer, int[]> standoff, String fullText, String outputPath, String filename) throws Exception{
int sentenceCounter = 0;
try { //
// Prepare the outputfiles
File fDep = new File(outputPath+filename+".dep");
File fMorph = new File(outputPath+filename+".morph");
fDep.createNewFile();
fMorph.createNewFile();
BufferedWriter depFile = new BufferedWriter(new FileWriter(fDep));
BufferedWriter morphFile = new BufferedWriter(new FileWriter(fMorph));
// All the stuff for the specific file
File f = new File(path);
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
int tokenIndex = 0;
ArrayList<Token> tokens = new ArrayList();
while ((line=reader.readLine()) != null) {
// NOTE: The input file is a series of lines of tokens, with a blank space separating tokens for different sentences.
// We consume tokens until we hit a blank line, and then process the sentence.
if(!line.isEmpty()){
String elements[] = line.split("\\t"); // text, stem, pos, chunkTag, neTag
String tokText = elements[0];
String tokStem = elements[1];
String tokPOS = elements[2];
String nounChunk = elements[3];
String neChunk = elements[4];
Token tok = new Token(tokenIndex,tokText,tokStem, tokPOS, nounChunk, neChunk);
tokens.add(tok);
tokenIndex++;
}
else { // empty line- parse the sentence whose tokens were collected
//System.out.println("sentence#="+sentenceCounter);
// Parse
Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = null;
try {
// Workflow:
// 1. Get sentence start and stop offsets
// 2. Parse sentence in file
// 3. Add GENIA tagging information to parsed tokens
// 4. Reconcile parsed sentence with all applicable entities in entity file
// 5. Print .dep and .morph files
// 6. In case of failed parse, print .morph file and empty entry in the .dep file
// Get standoff data
Pair<Integer,Integer> st = getStandOff(standoff, sentenceCounter, fullText);
int start = st.a;
int stop = st.b;
// Parse
pair = parseSentence(tokens,start,stop,fullText);
// Add GENIA info to parsed tokens- noun and named entity chunking
// NOTE: Currently tokens omitted from parse representation (prepositions, etc) are not added back in
HashMap<Integer, Token> parsedToks = pair.a;
addGENIAInfoToTokens(tokens,parsedToks);
// Print file out
printSentenceToDepAndMorph(pair,depFile,morphFile);
} catch(Exception e){
// When we cannot parse a sentence
// 1. Catch the exception and print an error message
System.err.println("Bad Parse on "+filename+".txt"+", sentence #"+(sentenceCounter+1));
// 2. Print the tokens out to the .morph file anyways
// 3. Make an empty entry in the .dep file
printSentenceToDepAndMorph(tokens,depFile,morphFile);
}
sentenceCounter++;
tokens = new ArrayList();
tokenIndex = 0;
System.out.print("."); // progress counter
}
}
System.out.println("."); // end of progress counter
// Close output files
depFile.close();
morphFile.close();
System.out.println("\t"+"created "+fDep.getPath());
System.out.println("\t"+"created "+fMorph.getPath());
}
catch(Exception e) {
System.err.println("Fatal Parse Error - skipping file "+filename+".txt"+", sentence #"+(sentenceCounter+1));
}
}
/**
* Add GENIA tagger info to parsed tokens- specifically noun and named entity chunking info.
* @param tokens
* @param wordMap
*/
private void addGENIAInfoToTokens(ArrayList<Token> tokens, HashMap<Integer, Token> wordMap) {
// NOTE: There are not the same number of objects in tokens as wordMap (parsing removes preps. and stuff from wordMap)
// thus: len(tokens) >= len(wordMap.values)
// We are creating a new wordmap object
//HashMap<Integer, Object[]> wordMap = new HashMap();
List<Object[]> words = new ArrayList(wordMap.values());
int j=0;
for(int i=0; i<tokens.size();i++){
Token tok = tokens.get(i);
j=i+1; // Dependency indices start 1, not 0, so make the token indices match
if(wordMap.containsKey(j)){
Token parsedTok = wordMap.get(j);
parsedTok.chunkTag = tok.chunkTag;
parsedTok.neTag = tok.neTag;
}
// else {
// // We add non-parsed words to the wordMap because they may contain GENIA tagging info for noun or ne chunks
// // NOTE: offsets are not calculated in the parser stage for these words
// // NOTE: -1 offset means disregard!!!
// Object[] wordArr = {tok.partOfSpeech, tok.text, tok.stem, -1, -1, tok.chunkTag, tok.neTag};
// wordMap.put(j, wordArr);
// }
}
}
/**
* Run Stanford Parser on one sentence (a list of GENIA-tagged tokens)
* NOTE: The start/stop offsets are needed to calculate per-word character offsets
* @param tok tokens of the sentence to parse
* @param start index of the sentence start in fullText
* @param stop index of the sentence end in fullText
* @param fullText full document text the sentence belongs to
* @throws Exception
*/
private Pair<HashMap<Integer, Token>, ArrayList<Object[]>> parseSentence(ArrayList<Token> tok, int start, int stop, String fullText) throws Exception{
//System.out.println("start,stop=<"+start+","+stop+">");
// Create parser input from genia-tagged input
ArrayList<Word> toksW = new ArrayList();
for(Token t:tok){
toksW.add( new Word(t.text) );
}
Tree parse = (Tree) lp.apply((List)toksW); // finally, we actually get to parse something
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
//Collection tdl = gs.typedDependenciesCollapsed(); //NOTE: Using Un-collapsed dependencies
Collection tdl = gs.typedDependencies();
// Data structures
HashMap<Integer,Token> wordMap = new HashMap<Integer, Token>(); // Holds values we build with .morph ( index -> { POS tag, word, stem, start offset, end offset } )
ArrayList<Object[]> relnList = new ArrayList<Object[]>(); // For .dep, holds arrays of the form { relation, head index, child index }
// We will walk over the dependency parse, pull out the indices, then do the same for the stemmed parse
// Unstemmed parse
for( Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
TypedDependency var = iter.next();
TreeGraphNode dep = var.dep();
TreeGraphNode gov = var.gov();
// All useful information for a node in the tree
String reln = var.reln().getShortName();
int depIdx = var.dep().index();
int govIdx = var.gov().index();
Object relnArr[] = {reln, govIdx, depIdx};
relnList.add(relnArr);
Token depTok = new Token(depIdx, dep.label().tag(), dep.label().value());
Token govTok = new Token(govIdx, gov.label().tag(), gov.label().value());
wordMap.put(depIdx, depTok);
wordMap.put(govIdx, govTok);
}
// Stemmed parse
// Get stems!
ls.visitTree(parse); // apply the stemmer to the tree
gs = gsf.newGrammaticalStructure(parse);
tdl = gs.typedDependenciesCollapsed();
for( Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
TypedDependency var = iter.next();
TreeGraphNode dep = var.dep();
TreeGraphNode gov = var.gov();
int depIdx = dep.index();
if( wordMap.containsKey(depIdx))
wordMap.get( depIdx ).stem = dep.value();
int govIdx = gov.index();
if( wordMap.containsKey(govIdx))
wordMap.get( govIdx ).stem = gov.value();
}
calculateWordOffsets(wordMap, fullText, start, stop, toksW);
Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = new Pair(wordMap, relnList);
return pair;
}
private void printSentenceToDepAndMorph(Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair, BufferedWriter depFile, BufferedWriter morphFile) throws IOException{
HashMap<Integer, Token> tokens = pair.a;
ArrayList<Object[]> relnList = pair.b;
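// Output format (derived from the writes below):
//   .morph - one tab-separated line per token: index, POS, text, stem, startOffset, endOffset,
//            chunkTag, neTag, acronym flags/expansions; a blank line ends the sentence
//   .dep   - one tab-separated line per dependency: relation, governor index, dependent index;
//            a blank line ends the sentence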
// Print .morph
List keys = new ArrayList(tokens.keySet()); Collections.sort(keys); // get tokens, in order
for(Object k:keys){
int key = Integer.parseInt(k.toString());
Token tok = tokens.get(key);
morphFile.write(tok.index+"\t");
morphFile.write(tok.pos+"\t");
morphFile.write(tok.text+"\t");
morphFile.write(tok.stem+"\t");
morphFile.write(tok.startOffset+"\t");
morphFile.write(tok.endOffset+"\t");
morphFile.write(tok.chunkTag+"\t");
morphFile.write(tok.neTag+"\t");
morphFile.write(tok.isAcronym+"\t");
morphFile.write(tok.isPartialAcronymMatch+"\t");
morphFile.write(tok.isProteinAcronymMatch+"\t");
morphFile.write(tok.acronym+"\t");
morphFile.write(tok.acronymExpandedText+"\t");
morphFile.write(tok.proteinAcronym+"\t");
morphFile.write("\n");
}
morphFile.newLine();
//print .dep
for(Object[] relnArr: relnList){
String output = relnArr[0]+"\t"+relnArr[1]+"\t"+relnArr[2];
depFile.write(output+"\n");
}
depFile.newLine();
}
private void printSentenceToDepAndMorph(ArrayList<Token> toks, BufferedWriter depFile, BufferedWriter morphFile) throws IOException{
for(Token tok:toks){
morphFile.write(tok.index+"\t");
morphFile.write(tok.pos+"\t");
morphFile.write(tok.text+"\t");
morphFile.write(tok.stem+"\t");
morphFile.write(tok.startOffset+"\t");
morphFile.write(tok.endOffset+"\t");
morphFile.write(tok.chunkTag+"\t");
morphFile.write(tok.neTag+"\t");
morphFile.write(tok.isAcronym+"\t");
morphFile.write(tok.isPartialAcronymMatch+"\t");
morphFile.write(tok.isProteinAcronymMatch+"\t");
morphFile.write(tok.acronym+"\t");
morphFile.write(tok.acronymExpandedText+"\t");
morphFile.write(tok.proteinAcronym+"\t");
morphFile.write("\n");
}
morphFile.newLine();
//print empty .dep
depFile.newLine();
}
/**
* Load GENIA sentence splitter standoff file (shows sentence boundaries)
* @param path
* @return hashmap<sentence_number, {start,stop}>
* @throws Exception
*/
private HashMap<Integer, int[]> loadStandoffFile(String path) throws Exception {
HashMap<Integer , int[]> standoff = new HashMap();
File f = new File(path);
BufferedReader reader = new BufferedReader(new FileReader(f));
String line = null;
int sentenceCounter = 0;
while ((line=reader.readLine()) != null) {
if( !line.isEmpty() ){
String elements[] = line.split("\\t"); // format: sentence start offset, sentence end offset
int[] startstop = { Integer.parseInt(elements[0]), Integer.parseInt(elements[1]) };
standoff.put(sentenceCounter, startstop);
sentenceCounter++;
}
}
return standoff;
}
/**
* Is this character whitespace?
* @param c character
* @return true if c is a space, newline, or carriage return
*/
private boolean IsWhiteSpace(char c){ return c == '\n' || c == ' ' || c == '\r'; }
/**
* Walk through the text and match each non-whitespace token in the tokenized sentence until completion.
* Some words (like prepositions and parens) aren't in the parse, so we need to make sure we can skip over them nicely and not break.
* NOTE: The whole thing is predicated on the fact that we are looping over the sentence, tokens and parse in order
* so the first word X or Y we see is the same in all 3 of these.
* @param wordMap Results of dependency parse (parseIndex-word mappings)
* @param text Block of text
* @param startIdx start of sentence
* @param stopIdx end of sentence
* @param sent0 Sentence tokens
* @throws Exception
*/
private void calculateWordOffsets(HashMap<Integer, Token> wordMap, String text, int startIdx, int stopIdx, List<Word> sent0) throws Exception {
// Parsed word data structure
Object[] keys = wordMap.keySet().toArray();
Arrays.sort(keys); // sort indices from smallest to largest
int wordArrIdx = 0;
// get into the correct position
int offset = startIdx;
int start = 0;
text = text.substring(startIdx, stopIdx);
for(Word word : sent0){
String w = word.toString();
String originalW = edu.stanford.nlp.process.PTBTokenizer.ptbToken2Text(w);
// first trim off any leading whitespace
while( IsWhiteSpace(text.charAt(0)) ){
text = text.substring(1);
offset++; //increment the offset counter
}
// now see if our word matches
start = offset;
char firstChar = originalW.charAt(0);
if( firstChar == text.charAt(0)){
// Is this the word in the token?
if( text.startsWith(originalW, 0) ){
int len = originalW.length();
offset+=len;
text = text.substring(len);
// Is this token a word in the parse?
Token tok = wordMap.get(keys[wordArrIdx]);
String wParse = tok.text;
wParse = wParse.replaceAll("\\\\/", "/"); // undo PTB escaping: the regex \\/ matches a literal "\/", which we replace with "/"
//System.out.println("wordmap=/"+wParse+"/ vs hasword=/"+originalW+"/"+" wordArrIdx="+wordArrIdx);
if ( wParse.equals(originalW) ){
//System.out.println("\t"+originalW+" ("+start+","+offset+")");
tok.startOffset = start;
tok.endOffset = offset;
wordArrIdx++;
if(wordArrIdx >= keys.length)
break;
}
}
} else {
System.out.println("w:"+word+", originalW"+originalW);
System.out.println("firstChar:"+firstChar+", text.charAt(0)"+text.charAt(0));
throw new Exception("unknown token");
}
}
}
/**
* Turn Stanford Parser sentence splitter output into a string (for a single sentence)
* @param s sentence list
* @return sentence as string
*/
public static String join(List<String> s) {
if (s.isEmpty()) return "";
String delimiter = " ";
Iterator<String> iter = s.iterator();
StringBuffer buffer = new StringBuffer(iter.next().toString());
while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
return buffer.toString();
}
}
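RunStanfordParser2 exposes no main method, so a small driver along the following lines is assumed (hypothetical, not part of the original gist; names are illustrative):
package foo;
// Hypothetical driver: wires the constructor and processGENIAfiles() together as the code above expects.
public class RunStanfordParser2Main {
    public static void main(String[] args) throws Exception {
        // args[0] = serialized grammar (englishPCFG.ser.gz)
        // args[1] = directory containing the GENIA .txt/.standoff/.tagNew/.a1 files
        // args[2] = output directory for the generated .dep and .morph files
        RunStanfordParser2 wrapper = new RunStanfordParser2(args[0]);
        wrapper.processGENIAfiles(args[1], args[2]);
    }
}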
@mindaugasmc

Hello,

The first file, RunStanfordParser.java, works fine.
But with RunStanfordParser2.java, why do I get errors on these imports:

import syntax.Protein;
import syntax.AnnotationType;
import syntax.Token;
import util.Pair;
import util.Util;

What should I add or import to the project to get this working?
Thank you in advance
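The syntax.* and util.* imports are the gist author's own project classes, not part of the Stanford Parser jar, so the second file will not compile without them. As a rough illustration only, minimal stand-ins consistent with how the code above uses them might look like the following (hypothetical; the author's real classes are not included in the gist, and syntax.Token, syntax.Protein and syntax.AnnotationType would likewise have to be reconstructed from their usage):
package util;
// Hypothetical stand-in for util.Pair, inferred from the fields .a and .b used above.
public class Pair<A, B> {
    public A a;
    public B b;
    public Pair(A a, B b) { this.a = a; this.b = b; }
}
package util;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
// Hypothetical stand-in for util.Util, inferred from the single call Util.readFileAsString(path).
public class Util {
    public static String readFileAsString(String path) throws IOException {
        return new String(Files.readAllBytes(Paths.get(path)));
    }
}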

@waldemarGr

waldemarGr commented Feb 10, 2018

Hi,
in RunStanfordParser.java, line 43, we can see:
LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl);
but LexicalizedParser has only one constructor:
public LexicalizedParser(Lexicon lex, BinaryGrammar bg, UnaryGrammar ug, DependencyGrammar dg, Index<String> stateIndex, Index<String> wordIndex, Index<String> tagIndex, Options op)
How do I initialize a LexicalizedParser with a single parameter like in your code?
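In releases newer than the 2012-era jar this gist targets, the one-argument constructor was removed in favor of a static factory method, so (assuming such a newer release) loading the parser would look roughly like this instead:
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
// Sketch assuming a newer Stanford Parser release, where loadModel replaces new LexicalizedParser(path).
LexicalizedParser lp = LexicalizedParser.loadModel(parserFileOrUrl);
// or, passing option flags directly:
// LexicalizedParser lp = LexicalizedParser.loadModel(parserFileOrUrl, "-maxLength", "80", "-retainTmpSubcategories");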
