Skip to content

Instantly share code, notes, and snippets.

@allanj
Created April 10, 2019 09:54
Show Gist options
  • Save allanj/3b62d4451d1b259166194eaac102379f to your computer and use it in GitHub Desktop.
Save allanj/3b62d4451d1b259166194eaac102379f to your computer and use it in GitHub Desktop.
Script that runs the Stanford CoreNLP neural dependency parser over CoNLL-2003 and OntoNotes data and writes the sentences back out in CoNLL-X format with predicted heads and relation labels.
package corenlp.process;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;
import statistics.RAWF;
/**
 * Converts CoNLL-2003 / OntoNotes annotated data into CoNLL-X format,
 * replacing the gold dependency columns with heads and relation labels
 * predicted by the Stanford neural {@link DependencyParser}.
 *
 * <p>Output columns per token:
 * {@code index  word  _  pos  pos  _  head  deprel  _  _  entity}.
 */
public class Converter {

    /** Path to the serialized dependency-parser model (classpath or file). */
    private final String modelPath;

    public Converter(String modelPath) {
        this.modelPath = modelPath;
    }

    /**
     * Reads space-separated CoNLL-2003 data ({@code word pos chunk-or-entity})
     * and writes parsed CoNLL-X output.
     *
     * @param path      input file in CoNLL-2003 format
     * @param writePath output file in CoNLL-X format
     * @throws IOException if reading or writing fails
     */
    public void readData(String path, String writePath) throws IOException {
        convert(path, writePath, false);
    }

    /**
     * Reads tab-separated OntoNotes data already in CoNLL-X layout
     * (word in column 1, POS in column 3, entity in the last column)
     * and writes re-parsed CoNLL-X output.
     *
     * @param path      input file in CoNLL-X format
     * @param writePath output file in CoNLL-X format
     * @throws IOException if reading or writing fails
     */
    public void readOntoNotes(String path, String writePath) throws IOException {
        convert(path, writePath, true);
    }

    /**
     * Shared conversion loop for both input formats. Sentences are delimited
     * by blank lines; each sentence is parsed and written out immediately.
     *
     * @param ontoNotes true for tab-separated OntoNotes columns,
     *                  false for space-separated CoNLL-2003 columns
     */
    private void convert(String path, String writePath, boolean ontoNotes) throws IOException {
        DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
        BufferedReader br = RAWF.reader(path);
        PrintWriter pw = RAWF.writer(writePath);
        try {
            List<CoreLabel> words = new ArrayList<>();
            List<String> output = new ArrayList<>();
            String line;
            while ((line = br.readLine()) != null) {
                if (line.isEmpty()) {
                    // Guard against leading/consecutive blank lines: never
                    // hand the parser an empty sentence.
                    if (!words.isEmpty()) {
                        writeSentence(parser, pw, words, output);
                        words = new ArrayList<>();
                        output = new ArrayList<>();
                    }
                } else if (ontoNotes) {
                    String[] values = line.split("\t");
                    output.add(values[values.length - 1]);
                    CoreLabel token = new CoreLabel();
                    token.setWord(values[1]);
                    token.setTag(values[3]);
                    words.add(token);
                } else {
                    String[] values = line.split(" ");
                    output.add(values[2]);
                    CoreLabel token = new CoreLabel();
                    token.setWord(values[0]);
                    token.setTag(values[1]);
                    words.add(token);
                }
            }
            // Flush a final sentence that is not followed by a trailing blank
            // line; the original code silently dropped it.
            if (!words.isEmpty()) {
                writeSentence(parser, pw, words, output);
            }
        } finally {
            // Always release both streams, even if parsing throws.
            br.close();
            pw.close();
        }
    }

    /**
     * Parses one sentence and writes it in CoNLL-X format followed by a
     * blank separator line.
     *
     * @param words  tokens of the sentence (word + POS tag set)
     * @param output per-token entity labels, parallel to {@code words}
     */
    private void writeSentence(DependencyParser parser, PrintWriter pw,
                               List<CoreLabel> words, List<String> output) {
        GrammaticalStructure gs = parser.predict(words);
        List<TypedDependency> deps = new ArrayList<>(gs.typedDependencies());
        int[] heads = new int[words.size()];
        String[] depLabels = new String[words.size()];
        for (TypedDependency dep : deps) {
            // Parser indices are 1-based; ROOT's governor index is 0, so a
            // root token stores -1 here and prints head 0 below.
            heads[dep.dep().index() - 1] = dep.gov().index() - 1;
            depLabels[dep.dep().index() - 1] = dep.reln().getShortName();
        }
        for (int p = 0; p < words.size(); p++) {
            CoreLabel word = words.get(p);
            int head = heads[p] + 1;
            pw.println((p+1) + "\t"+ word.word() + "\t_\t" +word.tag()+"\t"+word.tag()+"\t_\t"+head+"\t"+depLabels[p]+"\t_\t_\t"+output.get(p) );
        }
        pw.println();
    }

    public static void main(String... args) throws IOException {
        String[] x = new String[]{"SD", "UD"};
        for (String type : x) {
            String path = "edu/stanford/nlp/models/parser/nndep/english_"+type+".gz";
            Converter conv = new Converter(path);
            // conv.readData("data/conll2003/train.txt", "data/conll2003/train."+type.toLowerCase()+".conllx");
            // conv.readData("data/conll2003/dev.txt", "data/conll2003/dev."+type.toLowerCase()+".conllx");
            // conv.readData("data/conll2003/test.txt", "data/conll2003/test."+type.toLowerCase()+".conllx");
            conv.readOntoNotes("data/ontonotes/train.sd.conllx", "data/ontonotes/train.pred"+type.toLowerCase()+".conllx");
            conv.readOntoNotes("data/ontonotes/dev.sd.conllx", "data/ontonotes/dev.pred"+type.toLowerCase()+".conllx");
            conv.readOntoNotes("data/ontonotes/test.sd.conllx", "data/ontonotes/test.pred"+type.toLowerCase()+".conllx");
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment