Created
June 3, 2016 00:07
-
-
Save eellpp/aa5ca35f97027e42d5df6663e95cc27c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * testApp.java
 *
 * Copyright (c) 2000-2012, The University of Sheffield.
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 3, 29 June 2007.
 *
 * A copy of this licence is included in the distribution in the file
 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 * sandeepchellappen, 1/6/2016
 *
 * For details on the configuration options, see the user guide:
 * http://gate.ac.uk/cgi-bin/userguide/sec:creole-model:config
 */
package com.mytests; | |
import gate.*; | |
import gate.creole.*; | |
import gate.creole.metadata.*; | |
import gate.util.*; | |
import java.util.*; | |
import java.io.*; | |
import java.net.*; | |
import gate.*; | |
import gate.creole.*; | |
import gate.util.*; | |
import gate.util.persistence.PersistenceManager; | |
import gate.corpora.RepositioningInfo; | |
/** | |
* This class is the implementation of the resource TESTAPP. | |
*/ | |
@CreoleResource(name = "testApp", | |
comment = "Add a descriptive comment about this resource") | |
public class testApp { | |
/** The Corpus Pipeline application to contain ANNIE */ | |
private CorpusController annieController; | |
/** | |
* Initialise the ANNIE system. This creates a "corpus pipeline" | |
* application that can be used to run sets of documents through | |
* the extraction system. | |
*/ | |
public void initAnnie() throws GateException, IOException { | |
Out.prln("Initialising ANNIE..."); | |
// load the ANNIE application from the saved state in plugins/ANNIE | |
File pluginsHome = Gate.getPluginsHome(); | |
File anniePlugin = new File(pluginsHome, "ANNIE"); | |
// File annieGapp = new File(anniePlugin, "ANNIE_with_defaults.gapp"); | |
File annieGapp = new File("/Users/sandeepchellappen/Work/Learning/nlp/GateTuorial", "testApp.gapp"); | |
annieController = | |
(CorpusController) PersistenceManager.loadObjectFromFile(annieGapp); | |
Out.prln("...ANNIE loaded"); | |
} // initAnnie() | |
/** Tell ANNIE's controller about the corpus you want to run on */ | |
public void setCorpus(Corpus corpus) { | |
annieController.setCorpus(corpus); | |
} // setCorpus | |
/** Run ANNIE */ | |
public void execute() throws GateException { | |
Out.prln("Running ANNIE..."); | |
annieController.execute(); | |
Out.prln("...ANNIE complete"); | |
} // execute() | |
/** | |
* Run from the command-line, with a list of URLs as argument. | |
* <P><B>NOTE:</B><BR> | |
* This code will run with all the documents in memory - if you | |
* want to unload each from memory after use, add code to store | |
* the corpus in a DataStore. | |
*/ | |
public static void main(String args[]) throws GateException, IOException { | |
// initialise the GATE library | |
Out.prln("Initialising GATE..."); | |
Gate.init(); | |
Out.prln("...GATE initialised"); | |
// initialise ANNIE (this may take several minutes) | |
testApp annie = new testApp(); | |
annie.initAnnie(); | |
// create a GATE corpus and add a document for each command-line | |
// argument | |
Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus"); | |
for(int i = 0; i < args.length; i++) { | |
URL u = new URL(args[i]); | |
FeatureMap params = Factory.newFeatureMap(); | |
params.put("sourceUrl", u); | |
params.put("preserveOriginalContent", new Boolean(true)); | |
params.put("collectRepositioningInfo", new Boolean(true)); | |
Out.prln("Creating doc for " + u); | |
Document doc = (Document) | |
Factory.createResource("gate.corpora.DocumentImpl", params); | |
corpus.add(doc); | |
} // for each of args | |
// tell the pipeline about the corpus and run it | |
annie.setCorpus(corpus); | |
annie.execute(); | |
// for each document, get an XML document with the | |
// person and location names added | |
Iterator iter = corpus.iterator(); | |
int count = 0; | |
String startTagPart_1 = "<span GateID=\""; | |
String startTagPart_2 = "\" title=\""; | |
String startTagPart_3 = "\" style=\"background:Red;\">"; | |
String endTag = "</span>"; | |
while(iter.hasNext()) { | |
Document doc = (Document) iter.next(); | |
AnnotationSet defaultAnnotSet = doc.getAnnotations(); | |
// Get all the sentence annotations and for each sentence | |
// print all the tokens and their associated annotation | |
String[] reqTypes = {"Sentence"}; | |
AnnotationSet sentAnns = annie.getAllAnnotationForTypes(defaultAnnotSet ,reqTypes); | |
for(Annotation sent: sentAnns){ | |
Long startOffset = sent.getStartNode().getOffset(); | |
Long endOffset = sent.getEndNode().getOffset(); | |
// Get the annotation set within this range | |
String[] reqTypes1 = {"Location"}; | |
Set<Annotation> anns = annie.getAllAnnotationForTypesWithinOffset(defaultAnnotSet ,reqTypes1,startOffset,endOffset); | |
for(Annotation ann: anns){ | |
String word = gate.Utils.stringFor(doc,ann); | |
String type = ann.getType(); | |
System.out.println(word + " : " + type); | |
} | |
} | |
// Set annotTypesRequired = new HashSet(); | |
// annotTypesRequired.add("Person"); | |
// annotTypesRequired.add("Location"); | |
// Set<Annotation> peopleAndPlaces = | |
// new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); | |
// | |
// FeatureMap features = doc.getFeatures(); | |
// String originalContent = (String) | |
// features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); | |
// RepositioningInfo info = (RepositioningInfo) | |
// features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); | |
// | |
// ++count; | |
// File file = new File("StANNIE_" + count + ".HTML"); | |
// Out.prln("File name: '"+file.getAbsolutePath()+"'"); | |
// if(originalContent != null && info != null) { | |
// Out.prln("OrigContent and reposInfo existing. Generate file..."); | |
// | |
// Iterator it = peopleAndPlaces.iterator(); | |
// Annotation currAnnot; | |
// SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); | |
// | |
// while(it.hasNext()) { | |
// currAnnot = (Annotation) it.next(); | |
// sortedAnnotations.addSortedExclusive(currAnnot); | |
// } // while | |
// | |
// StringBuffer editableContent = new StringBuffer(originalContent); | |
// long insertPositionEnd; | |
// long insertPositionStart; | |
// // insert anotation tags backward | |
// Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); | |
// Out.prln("Sorted annotations count: "+sortedAnnotations.size()); | |
// for(int i=sortedAnnotations.size()-1; i>=0; --i) { | |
// currAnnot = (Annotation) sortedAnnotations.get(i); | |
// insertPositionStart = | |
// currAnnot.getStartNode().getOffset().longValue(); | |
// insertPositionStart = info.getOriginalPos(insertPositionStart); | |
// insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); | |
// insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); | |
// if(insertPositionEnd != -1 && insertPositionStart != -1) { | |
// editableContent.insert((int)insertPositionEnd, endTag); | |
// editableContent.insert((int)insertPositionStart, startTagPart_3); | |
// editableContent.insert((int)insertPositionStart, | |
// currAnnot.getType()); | |
// editableContent.insert((int)insertPositionStart, startTagPart_2); | |
// editableContent.insert((int)insertPositionStart, | |
// currAnnot.getId().toString()); | |
// editableContent.insert((int)insertPositionStart, startTagPart_1); | |
// } // if | |
// } // for | |
// | |
// FileWriter writer = new FileWriter(file); | |
// writer.write(editableContent.toString()); | |
// writer.close(); | |
// } // if - should generate | |
// else if (originalContent != null) { | |
// Out.prln("OrigContent existing. Generate file..."); | |
// | |
// Iterator it = peopleAndPlaces.iterator(); | |
// Annotation currAnnot; | |
// SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); | |
// | |
// while(it.hasNext()) { | |
// currAnnot = (Annotation) it.next(); | |
// sortedAnnotations.addSortedExclusive(currAnnot); | |
// } // while | |
// | |
// StringBuffer editableContent = new StringBuffer(originalContent); | |
// long insertPositionEnd; | |
// long insertPositionStart; | |
// // insert anotation tags backward | |
// Out.prln("Unsorted annotations count: "+peopleAndPlaces.size()); | |
// Out.prln("Sorted annotations count: "+sortedAnnotations.size()); | |
// for(int i=sortedAnnotations.size()-1; i>=0; --i) { | |
// currAnnot = (Annotation) sortedAnnotations.get(i); | |
// insertPositionStart = | |
// currAnnot.getStartNode().getOffset().longValue(); | |
// insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); | |
// if(insertPositionEnd != -1 && insertPositionStart != -1) { | |
// editableContent.insert((int)insertPositionEnd, endTag); | |
// editableContent.insert((int)insertPositionStart, startTagPart_3); | |
// editableContent.insert((int)insertPositionStart, | |
// currAnnot.getType()); | |
// editableContent.insert((int)insertPositionStart, startTagPart_2); | |
// editableContent.insert((int)insertPositionStart, | |
// currAnnot.getId().toString()); | |
// editableContent.insert((int)insertPositionStart, startTagPart_1); | |
// } // if | |
// } // for | |
// | |
// FileWriter writer = new FileWriter(file); | |
// writer.write(editableContent.toString()); | |
// writer.close(); | |
// } | |
// else { | |
// Out.prln("Content : "+originalContent); | |
// Out.prln("Repositioning: "+info); | |
// } | |
// | |
// String xmlDocument = doc.toXml(peopleAndPlaces, false); | |
// String fileName = new String("StANNIE_toXML_" + count + ".HTML"); | |
// FileWriter writer = new FileWriter(fileName); | |
// writer.write(xmlDocument); | |
// writer.close(); | |
} // for each doc | |
} // main | |
private Set<Annotation> getAllAnnotationForTypesWithinOffset(AnnotationSet defaultAnnotSet, String[] reqTypes, | |
Long startOffset, Long endOffset) { | |
Set<Annotation> annotTypesRequired = Collections.emptySet(); | |
// Set<Annotation> peopleAndPlaces = | |
// new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); | |
for(String type: reqTypes){ | |
Set<Annotation> annSet = defaultAnnotSet.get(type,startOffset,endOffset); | |
annotTypesRequired.addAll(annSet); | |
} | |
return annotTypesRequired; | |
} | |
private AnnotationSet getAllAnnotationForTypes(AnnotationSet aSet ,String[] reqTypes) { | |
Set annotTypesRequired = new HashSet(); | |
for(String type: reqTypes){ | |
annotTypesRequired.add(type); | |
} | |
return (AnnotationSet) aSet.get(annotTypesRequired); | |
} | |
/** | |
* | |
*/ | |
public static class SortedAnnotationList extends Vector { | |
public SortedAnnotationList() { | |
super(); | |
} // SortedAnnotationList | |
public boolean addSortedExclusive(Annotation annot) { | |
Annotation currAnot = null; | |
// overlapping check | |
for (int i=0; i<size(); ++i) { | |
currAnot = (Annotation) get(i); | |
if(annot.overlaps(currAnot)) { | |
return false; | |
} // if | |
} // for | |
long annotStart = annot.getStartNode().getOffset().longValue(); | |
long currStart; | |
// insert | |
for (int i=0; i < size(); ++i) { | |
currAnot = (Annotation) get(i); | |
currStart = currAnot.getStartNode().getOffset().longValue(); | |
if(annotStart < currStart) { | |
insertElementAt(annot, i); | |
/* | |
Out.prln("Insert start: "+annotStart+" at position: "+i+" size="+size()); | |
Out.prln("Current start: "+currStart); | |
*/ | |
return true; | |
} // if | |
} // for | |
int size = size(); | |
insertElementAt(annot, size); | |
//Out.prln("Insert start: "+annotStart+" at size position: "+size); | |
return true; | |
} // addSorted | |
} // SortedAnnotationList | |
} // class testApp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment