# NLP PA4 Greedy Search
# Gist anonymous/2337478, created April 8, 2012 14:04.
#
# ---- First file of the gist (imported further down as module `FeatureFactory`) ----
import json, sys, collections
import base64
from Datum import Datum
class FeatureData:
    """Bundle of inputs handed to every feature callable.

    The attribute names are deliberately short because the feature
    lambdas reference them constantly.
    """

    def __init__(self, words, prevLabel, position, wordsDict):
        """Store the four inputs unchanged.

        words      -- sequence of words; feature callables index it with
                      ``position``.
        prevLabel  -- label assigned to the previous position.
        position   -- index into ``words`` of the word being featurized.
        wordsDict  -- mapping from word to a count (built by the caller).
        """
        self.w = words        # the word sequence
        self.pL = prevLabel   # label of the previous position
        self.pos = position   # index of the current word in self.w
        self.wD = wordsDict   # word -> count mapping

    def __repr__(self):
        # Keep it short: self.w may hold an entire corpus.
        return 'FeatureData(pos=%r, pL=%r)' % (self.pos, self.pL)
class FeatureFactory:
    """Builds feature strings for every position of a labelled word sequence.

    A feature "idea" is a callable that takes a FeatureData and returns a
    feature string, or a falsy value when the feature does not apply.
    ``self.ideas`` holds every candidate idea; ``self.goodIdeas`` holds the
    indices into ``self.ideas`` that computeFeatures() uses by default.
    """

    def __init__(self):
        """
        Add any necessary initialization steps for your features here
        Using this constructor is optional. Depending on your
        features, you may not need to initialize anything.
        """
        self.prevLabels = {}
        # Word counts; rebuilt by computeFeatures() whenever it is called
        # with position 0. Created here so that a first call at another
        # position finds an (empty) mapping instead of a missing attribute.
        self.wordsDict = collections.defaultdict(int)
        self.ideas = self.createFeatureIdeas()
        self.goodIdeas = []  #Once you find some good ideas, populate this list

    def addFeatureIdea(self, ideasList, features, conds=None):
        """Append one candidate feature to ``ideasList``.

        features -- a single callable ``fd -> str``, or an iterable of such
                    callables whose results are concatenated, each one
                    followed by ', '.
        conds    -- None (feature always applies), a single predicate
                    ``fd -> bool``, or an iterable of predicates that must
                    all hold.

        The appended callable returns the feature string when the condition
        holds and a falsy value otherwise. The index of the new idea and a
        preview of the feature on a three-word dummy sample are printed.
        """
        if callable(features):
            feature = features
        else:
            feature = lambda fd: ''.join(part(fd) + ', ' for part in features)

        if conds is None:
            cond = lambda fd: True
        elif callable(conds):
            cond = conds
        else:
            cond = lambda fd: all(check(fd) for check in conds)

        tempFeature = FeatureData(['Prev', 'Curr', 'Next'], 'P', 1,
                                  {'Prev': 1, 'Curr': 1, 'Next': 1})
        # A single pre-formatted argument prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print('Idea %d : %s' % (len(ideasList), feature(tempFeature)))
        ideasList.append(lambda fd: cond(fd) and feature(fd))

    def createCase(self, t):
        """Return a feature callable that always yields 'case=' + t."""
        return lambda fd: 'case=' + t

    def notCond(self, cond):
        """Return a predicate that is the logical negation of ``cond``."""
        return lambda fd: not cond(fd)

    def orConds(self, conds):
        """Return a predicate that holds when any predicate in ``conds`` holds.

        The returned predicate is False for an empty ``conds``.
        """
        # The earlier version returned a lambda that returned another
        # lambda, so calling it never evaluated the conditions.
        return lambda fd: any(check(fd) for check in conds)

    def condsToFeature(self, text, conds):
        """Turn a predicate (or iterable of predicates) into a yes/no feature.

        The returned callable yields ``text + '=yes'`` when the single
        predicate holds -- or, for an iterable, when all of them hold --
        and ``text + '=no'`` otherwise.
        """
        if callable(conds):
            holds = conds
        else:
            # Fold with "all": starting an and-fold from False (as the
            # earlier version did) can only ever produce 'no'.
            holds = lambda fd: all(check(fd) for check in conds)
        return lambda fd: text + '=' + ('yes' if holds(fd) else 'no')

    def createFeatureIdeas(self):
        """Build and return the list of candidate feature ideas.

        The index of each idea in the returned list is noted next to the
        call that registers it.
        """
        featureIdeas = []

        # Building blocks.
        currWord = lambda fd: "word=" + fd.w[fd.pos]
        prevLabel = lambda fd: "prevLabel=" + fd.pL
        # 'yes' when the first character of the current word is upper case.
        currWordTitle = self.condsToFeature(
            'CurrWordTitle', lambda fd: fd.w[fd.pos][0].isupper())
        hasPrev = lambda fd: fd.pos > 0
        prevWord = lambda fd: "prevWord=" + fd.w[fd.pos - 1]

        self.addFeatureIdea(featureIdeas, currWord)                   #0
        self.addFeatureIdea(featureIdeas, prevLabel)                  #1
        self.addFeatureIdea(featureIdeas, [currWord, currWordTitle])  #2
        self.addFeatureIdea(featureIdeas, [currWord, prevLabel])      #3
        # Guarded by hasPrev so position 0 does not wrap around to w[-1].
        self.addFeatureIdea(featureIdeas, prevWord, hasPrev)          #4
        return featureIdeas

    def computeFeatures(self, words, previousLabel, position, ideas=None):
        """
        Words is a list of the words in the entire corpus, previousLabel is the label
        for position-1 (or O if it's the start of a new sentence), and position
        is the word you are adding features for. PreviousLabel must be the
        only label that is visible to this method.

        ideas is an optional collection of feature callables; when it is
        None the ideas selected by self.goodIdeas are used. Returns the
        list of truthy values produced by the ideas, in idea order.
        """
        if position == 0:  #Build Dictionaries
            self.wordsDict = collections.defaultdict(int)
            for word in words:
                self.wordsDict[word] += 1

        featureData = FeatureData(words, previousLabel, position,
                                  self.wordsDict)
        if ideas is None:
            ideas = [self.ideas[index] for index in self.goodIdeas]

        features = []
        for idea in ideas:
            feature = idea(featureData)
            if feature:
                features.append(feature)
        return features

    def readData(self, filename):
        """ Do not modify this method """
        # Reads whitespace-separated "word label" lines into Datum objects.
        # NOTE(review): the file object is never closed explicitly; left
        # as-is because this method is marked "do not modify".
        data = []
        for line in open(filename, 'r'):
            line_split = line.split()
            # remove empty lines
            if len(line_split) < 2:
                continue
            word = line_split[0]
            label = line_split[1]
            datum = Datum(word, label)
            data.append(datum)
        return data

    def readTestData(self, ch_aux):
        """ Do not modify this method """
        # Same parsing as readData(), but from a string instead of a file.
        data = []
        for line in ch_aux.splitlines():
            line_split = line.split()
            # remove empty lines
            if len(line_split) < 2:
                continue
            word = line_split[0]
            label = line_split[1]
            datum = Datum(word, label)
            data.append(datum)
        return data

    def setFeaturesTrain(self, data, ideas=None):
        """ Do not modify this method """
        # Returns a new list of Datum objects carrying features computed
        # with the true label of the preceding datum as previousLabel.
        newData = []
        words = []
        for datum in data:
            words.append(datum.word)
        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        previousLabel = "O"
        for i in range(0, len(data)):
            datum = data[i]
            newDatum = Datum(datum.word, datum.label)
            newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
            newDatum.previousLabel = previousLabel
            newData.append(newDatum)
            previousLabel = datum.label
        return newData

    def setFeaturesTest(self, data, ideas=None):
        """
        Compute the features for all possible previous labels
        for Viterbi algorithm. Do not modify this method
        """
        newData = []
        words = []
        labels = []
        labelIndex = {}
        for datum in data:
            words.append(datum.word)
            # NOTE(review): dict.has_key exists only on Python 2; kept
            # verbatim because this method is marked "do not modify".
            if not labelIndex.has_key(datum.label):
                labelIndex[datum.label] = len(labels)
                labels.append(datum.label)
        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        for i in range(0, len(data)):
            datum = data[i]
            if i == 0:
                # The first position only ever has "O" as previous label.
                previousLabel = "O"
                datum.features = self.computeFeatures(words, previousLabel, i, ideas)
                newDatum = Datum(datum.word, datum.label)
                newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
                newDatum.previousLabel = previousLabel
                newData.append(newDatum)
            else:
                # One output datum per label seen anywhere in the data.
                for previousLabel in labels:
                    datum.features = self.computeFeatures(words, previousLabel, i, ideas)
                    newDatum = Datum(datum.word, datum.label)
                    newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
                    newDatum.previousLabel = previousLabel
                    newData.append(newDatum)
        return newData

    def writeData(self, data, filename):
        """
        write words, labels, and features into a json file
        Do not modify this method
        """
        # One JSON object per line is written to filename + '.json'.
        # NOTE(review): b64encode(str) is a Python 2 usage (Python 3 needs
        # bytes); kept verbatim because this method is marked
        # "do not modify".
        outFile = open(filename + '.json', 'w')
        for i in range(0, len(data)):
            datum = data[i]
            jsonObj = {}
            jsonObj['_label'] = datum.label
            jsonObj['_word']= base64.b64encode(datum.word)
            jsonObj['_prevLabel'] = datum.previousLabel
            featureObj = {}
            features = datum.features
            for j in range(0, len(features)):
                feature = features[j]
                featureObj['_'+feature] = feature
            jsonObj['_features'] = featureObj
            outFile.write(json.dumps(jsonObj) + '\n')
        outFile.close()
# ---- Second file of the gist (invoked as NER.py according to its usage string) ----
import math
import os
import sys
from subprocess import PIPE, Popen

from FeatureFactory import FeatureFactory
"""
Do not modify this class
The submit script does not use this class
It directly calls the methods of FeatureFactory and MEMM classes.
"""
def TrainAndTest(featureFactory, trainData, testData, printOp, ideas=None):
    """Featurize both data sets, dump them as JSON and run the Java MEMM.

    featureFactory -- object providing setFeaturesTrain, setFeaturesTest
                      and writeData.
    trainData      -- training data handed to setFeaturesTrain.
    testData       -- test data handed to setFeaturesTest.
    printOp        -- extra command-line argument passed to MEMM as-is.
    ideas          -- optional feature ideas forwarded to the featurizers.

    Returns whatever the java process wrote to its standard output.
    """
    # Featurize train first, then test, before anything is written.
    featurized = (
        ('trainWithFeatures', featureFactory.setFeaturesTrain(trainData, ideas)),
        ('testWithFeatures', featureFactory.setFeaturesTest(testData, ideas)),
    )

    # writeData appends the '.json' suffix to the base name itself.
    for baseName, dataWithFeatures in featurized:
        featureFactory.writeData(dataWithFeatures, baseName)

    # run MEMM
    command = ['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
               'trainWithFeatures.json', 'testWithFeatures.json',
               printOp]
    memm = Popen(command, stdout=PIPE)
    stdoutData = memm.communicate()[0]
    return stdoutData
def calcF(output, beta):
    """Compute the F-beta score from the text printed by the MEMM run.

    output -- text whose whitespace-separated token 2 is parsed as the
              precision and token 5 as the recall (presumably lines such
              as "precision = P" / "recall = R"; confirm against MEMM).
    beta   -- weight of recall relative to precision.

    Returns (1 + beta^2) * P * R / (beta^2 * P + R), or nan when there is
    no output (empty or whitespace only) or when the denominator is zero.
    The denominator is zero when precision and recall are both 0, and also
    when beta is 0 and recall is 0; the latter previously raised
    ZeroDivisionError.
    """
    undefined = float('nan')
    tokens = output.split()
    if not tokens:
        return undefined

    precision = float(tokens[2])
    recall = float(tokens[5])
    betasq = beta ** 2
    denominator = betasq * precision + recall
    if denominator == 0:
        return undefined
    return (1 + betasq) * precision * recall / denominator
def main(argv):
    """Run the MEMM once, or greedily search for a good set of feature ideas.

    argv[0], argv[1] -- training and test file names.
    argv[2]          -- optional mode:
                        '-print'        pass '-print' through to MEMM,
                        '-greedy'       greedy search starting from
                                        featureFactory.goodIdeas,
                        '-greedy-blank' greedy search starting from nothing.
    argv[3]          -- optional beta for the F score (default 1).
    argv[4]          -- optional Python expression giving the idea indices
                        still to be tried.

    Prints the usage text and exits with status 0 when fewer than two
    arguments are given or the mode is not recognised.

    Greedy search: each round trains once per remaining idea with that idea
    added to the current selection, then keeps the single best idea if it
    beats the best score so far; it stops when no idea improves the score
    or none are left. Runs whose score is nan are never selected, and a nan
    baseline is beaten by any real score.
    """
    usage = 'USAGE: python NER.py trainFile testFile [-print|-greedy|-greedy-blank] F-beta Features-To-Test'
    if len(argv) < 2:
        print(usage)
        sys.exit(0)

    printOp = ''
    greedy = False
    greedyBlank = False
    if len(argv) > 2:
        if argv[2] == '-print':
            printOp = '-print'
        elif argv[2] == '-greedy':
            greedy = True
        elif argv[2] == '-greedy-blank':
            greedyBlank = True
        else:
            print(usage)
            sys.exit(0)

    featureFactory = FeatureFactory()
    # read the train and test data
    trainData = featureFactory.readData(argv[0])
    testData = featureFactory.readData(argv[1])

    if not (greedy or greedyBlank):
        output = TrainAndTest(featureFactory, trainData, testData, printOp)
        print(output)
        return

    beta = 1
    if len(argv) > 3:
        beta = float(argv[3])

    leftIdeas = list(range(len(featureFactory.ideas)))
    if greedyBlank or len(featureFactory.goodIdeas) == 0:
        goodIdeas = []
        bestF = 0.
    else:  #start with what we already discovered
        # Copy, so the search does not mutate the factory's own list.
        goodIdeas = list(featureFactory.goodIdeas)
        leftIdeas = [idea for idea in leftIdeas if idea not in goodIdeas]
        ideas = [featureFactory.ideas[idea] for idea in goodIdeas]
        output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
        bestF = calcF(output, beta)

    if len(argv) > 4:
        # NOTE(review): eval() executes arbitrary code taken from the
        # command line. Kept because expressions such as "range(3, 5)" may
        # be in use; switch to ast.literal_eval if only literals are needed.
        leftIdeas = list(eval(argv[4]))

    # Each message is formatted into one string so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    print('Start Ideas: %s' % (goodIdeas,))
    print('Left Ideas: %s' % (leftIdeas,))

    improved = True
    while len(leftIdeas) > 0 and improved:
        improved = False
        F = []
        for i in leftIdeas:
            trial = goodIdeas + [i]
            print('Left Ideas: %s' % (leftIdeas,))
            print('Trying Ideas: %s' % (trial,))
            ideas = [featureFactory.ideas[idea] for idea in trial]
            output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
            F.append(calcF(output, beta))
            print('Left Ideas: %s' % (leftIdeas,))
            print('Tried Ideas: %s' % (trial,))
            print(output)
            # %g instead of %d so a fractional beta is not truncated.
            print('F(alpha=%g): %s -> %s' % (beta, bestF, F))

        # nan compares False against everything, so max(F) and
        # "maxF > bestF" would silently misbehave; rank real scores only.
        scored = [(score, idea) for score, idea in zip(F, leftIdeas)
                  if not math.isnan(score)]
        if not scored:
            break
        # max() keeps the first of equal scores, i.e. the earliest idea.
        maxF, bestIdea = max(scored, key=lambda pair: pair[0])
        if math.isnan(bestF) or maxF > bestF:
            improved = True
            print('Added Idea: %s F(alpha=%g): %s -> %s'
                  % (bestIdea, beta, bestF, maxF))
            bestF = maxF
            goodIdeas.append(bestIdea)
            leftIdeas.remove(bestIdea)

    print(goodIdeas)


if __name__ == '__main__':
    main(sys.argv[1:])
# (End of gist.)