Skip to content

Instantly share code, notes, and snippets.

Created April 8, 2012 14:04
Show Gist options
  • Save anonymous/2337478 to your computer and use it in GitHub Desktop.
Save anonymous/2337478 to your computer and use it in GitHub Desktop.
NLP PA4 Greedy Search
import json, sys, collections
import base64
from Datum import Datum
class FeatureData(object):
    """Bundle of everything a single feature function is allowed to look at.

    Attributes (short names are kept because the feature lambdas use them):
        w   -- the list of words handed to the constructor
        pL  -- the label assigned to the previous position
        pos -- index into ``w`` of the word currently being featurized
        wD  -- the word dictionary handed to the constructor
    """

    def __init__(self, words, prevLabel, position,
                 wordsDict):
        self.w = words
        self.pL = prevLabel
        self.pos = position
        self.wD = wordsDict
def _isIterable(obj):
    """Return True when ``obj`` can be iterated over.

    Used instead of ``collections.Iterable``, which no longer exists on
    Python 3.10+, to tell "one callable" apart from "a collection of
    callables".
    """
    try:
        iter(obj)
    except TypeError:
        return False
    return True


class FeatureFactory(object):
    """Creates string features for every word of a labelled corpus.

    Feature "ideas" are callables that take a ``FeatureData`` and return
    either a feature string or a falsy value (meaning "no feature here").
    ``self.ideas`` holds every registered idea; ``self.goodIdeas`` holds
    the indices into ``self.ideas`` that are used by default.
    """

    def __init__(self):
        """
        Add any necessary initialization steps for your features here.
        Using this constructor is optional. Depending on your
        features, you may not need to initialize anything.
        """
        self.prevLabels = {}
        # Word -> frequency table; (re)built by computeFeatures.
        self.wordsDict = None
        self.ideas = self.createFeatureIdeas()
        self.goodIdeas = []  # Once you find some good ideas, populate this list

    def addFeatureIdea(self, ideasList, features, conds=None):
        """Register a new feature idea and print a sample of its output.

        ideasList -- list the new idea (a callable) is appended to
        features  -- one feature callable, or an iterable of them whose
                     outputs are concatenated, each followed by ', '
        conds     -- None (always applies), one predicate, or an iterable
                     of predicates that must all hold for the feature
                     to be emitted
        """
        if _isIterable(features):
            parts = tuple(features)
            # The ', ' after every part (including the last one) is part
            # of the emitted feature string, so it is kept exactly.
            feature = lambda fd: ''.join(part(fd) + ', ' for part in parts)
        else:
            feature = features
        if conds is None:
            cond = lambda fd: True
        elif _isIterable(conds):
            guards = tuple(conds)
            cond = lambda fd: all(guard(fd) for guard in guards)
        else:
            cond = conds
        # Dummy datum used only to show what the feature looks like.
        sample = FeatureData(['Prev', 'Curr', 'Next'], 'P', 1,
                             {'Prev': 1, 'Curr': 1, 'Next': 1})
        print('Idea %d : %s' % (len(ideasList), feature(sample)))
        ideasList.append(lambda fd: cond(fd) and feature(fd))

    def createCase(self, t):
        """Return a feature callable that always yields 'case=' + t."""
        return lambda fd: 'case=' + t

    def notCond(self, cond):
        """Return a predicate that is the negation of ``cond``."""
        return lambda fd: not cond(fd)

    def orConds(self, conds):
        """Return a predicate that holds when any predicate in ``conds`` holds.

        The original returned a lambda producing another lambda, which is
        always truthy; the combined predicate now returns a real boolean.
        """
        return lambda fd: any(cond(fd) for cond in conds)

    def condsToFeature(self, text, conds):
        """Turn one predicate, or an iterable of predicates, into a feature.

        The resulting callable yields ``text + '=yes'`` when the predicate
        holds (for an iterable: when all of them hold) and ``text + '=no'``
        otherwise.
        """
        if _isIterable(conds):
            guards = tuple(conds)
            # The original folded with an initial value of False, so the
            # conjunction could never be true.
            holds = lambda fd: all(guard(fd) for guard in guards)
        else:
            holds = conds
        return lambda fd: text + '=' + ('yes' if holds(fd) else 'no')

    def createFeatureIdeas(self):
        """Build the list of candidate feature ideas.

        The position of an idea in the returned list is its index, which
        is what ``self.goodIdeas`` refers to, so the order must be stable.
        """
        featureIdeas = []
        currWord = lambda fd: "word=" + fd.w[fd.pos]
        prevLabel = lambda fd: "prevLabel=" + fd.pL
        # [:1] rather than [0] so an empty token yields 'no' instead of
        # raising IndexError.
        currWordTitle = self.condsToFeature(
            'CurrWordTitle', lambda fd: fd.w[fd.pos][:1].isupper())
        hasPrev = lambda fd: fd.pos > 0
        prevWord = lambda fd: "prevWord=" + fd.w[fd.pos - 1]
        self.addFeatureIdea(featureIdeas, currWord)  # 0
        self.addFeatureIdea(featureIdeas, prevLabel)  # 1
        self.addFeatureIdea(featureIdeas, [currWord, currWordTitle])  # 2
        self.addFeatureIdea(featureIdeas, [currWord, prevLabel])  # 3
        self.addFeatureIdea(featureIdeas, prevWord, hasPrev)  # 4
        return featureIdeas

    def computeFeatures(self, words, previousLabel, position, ideas=None):
        """
        Words is a list of the words in the entire corpus, previousLabel is
        the label for position-1 (or O if it's the start of a new sentence),
        and position is the word you are adding features for. PreviousLabel
        must be the only label that is visible to this method.

        ideas -- optional list of idea callables to apply; when None the
                 ideas selected by ``self.goodIdeas`` are used.
        Returns the list of truthy feature values produced by the ideas.
        """
        # Build the word-frequency table at the start of a pass; also build
        # it when it does not exist yet so a first call with position != 0
        # does not fail with AttributeError.
        if position == 0 or getattr(self, 'wordsDict', None) is None:
            counts = collections.defaultdict(int)
            for word in words:
                counts[word] += 1
            self.wordsDict = counts
        featureData = FeatureData(words, previousLabel, position,
                                  self.wordsDict)
        if ideas is None:
            ideas = [self.ideas[index] for index in self.goodIdeas]
        features = []
        for idea in ideas:
            feature = idea(featureData)
            if feature:  # ideas return a falsy value when they do not apply
                features.append(feature)
        return features

    def _linesToData(self, lines):
        """Parse 'word label' lines into Datum objects (shared by readers)."""
        data = []
        for line in lines:
            line_split = line.split()
            # skip empty lines and lines without a label column
            if len(line_split) < 2:
                continue
            data.append(Datum(line_split[0], line_split[1]))
        return data

    def readData(self, filename):
        """Read 'word label' lines from ``filename`` into a list of Datum.

        Marked "do not modify" in the original; the result is unchanged,
        the file is now closed deterministically.
        """
        with open(filename, 'r') as inFile:
            return self._linesToData(inFile)

    def readTestData(self, ch_aux):
        """Parse 'word label' lines from the string ``ch_aux``.

        Marked "do not modify" in the original; the result is unchanged.
        """
        return self._linesToData(ch_aux.splitlines())

    def setFeaturesTrain(self, data, ideas=None):
        """Return copies of ``data`` with features and previousLabel set.

        Marked "do not modify" in the original. The previous label of each
        datum is the true label of the datum before it ("O" for the first).
        """
        newData = []
        words = [datum.word for datum in data]
        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        previousLabel = "O"
        for i, datum in enumerate(data):
            newDatum = Datum(datum.word, datum.label)
            newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
            newDatum.previousLabel = previousLabel
            newData.append(newDatum)
            previousLabel = datum.label
        return newData

    def setFeaturesTest(self, data, ideas=None):
        """
        Compute the features for all possible previous labels
        for Viterbi algorithm. Marked "do not modify" in the original.

        Position 0 gets a single datum with previous label "O"; every other
        position gets one datum per distinct label seen in ``data``, in
        first-seen order. As in the original, ``features`` on each input
        datum is overwritten as a side effect.
        """
        newData = []
        words = []
        labels = []
        labelIndex = {}
        for datum in data:
            words.append(datum.word)
            if datum.label not in labelIndex:
                labelIndex[datum.label] = len(labels)
                labels.append(datum.label)
        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        for i, datum in enumerate(data):
            candidates = ["O"] if i == 0 else labels
            for previousLabel in candidates:
                # Computed once (the original computed it twice); the new
                # datum gets its own list so the two never alias.
                features = self.computeFeatures(words, previousLabel, i, ideas)
                datum.features = features
                newDatum = Datum(datum.word, datum.label)
                newDatum.features = list(features)
                newDatum.previousLabel = previousLabel
                newData.append(newDatum)
        return newData

    def writeData(self, data, filename):
        """
        Write words, labels, and features into a json file, one JSON object
        per line, at ``filename + '.json'``. Marked "do not modify" in the
        original; the emitted fields are unchanged.
        """
        with open(filename + '.json', 'w') as outFile:
            for datum in data:
                word = datum.word
                # b64encode needs bytes; text words are encoded first and
                # the result is turned back into text so json can dump it.
                if not isinstance(word, bytes):
                    word = word.encode('utf-8')
                jsonObj = {
                    '_label': datum.label,
                    '_word': base64.b64encode(word).decode('ascii'),
                    '_prevLabel': datum.previousLabel,
                    '_features': {'_' + feature: feature
                                  for feature in datum.features},
                }
                outFile.write(json.dumps(jsonObj) + '\n')
import sys, os
from subprocess import Popen, PIPE
from FeatureFactory import FeatureFactory
"""
Do not modify this class
The submit script does not use this class
It directly calls the methods of FeatureFactory and MEMM classes.
"""
def TrainAndTest(featureFactory, trainData, testData, printOp, ideas=None):
    """Featurize both data sets, dump them to JSON and run the Java MEMM.

    featureFactory -- object providing setFeaturesTrain, setFeaturesTest
                      and writeData
    trainData, testData -- the data handed to the two setFeatures* calls
    printOp -- extra command-line argument passed through to MEMM
    ideas   -- forwarded unchanged to the setFeatures* calls

    Returns whatever the MEMM process wrote to its standard output.
    """
    # Attach features to both sets first, then serialise them.
    featurizedTrain = featureFactory.setFeaturesTrain(trainData, ideas)
    featurizedTest = featureFactory.setFeaturesTest(testData, ideas)
    for stem, rows in (('trainWithFeatures', featurizedTrain),
                       ('testWithFeatures', featurizedTest)):
        featureFactory.writeData(rows, stem)

    # Run the classifier on the two files just written.
    command = ['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
               'trainWithFeatures.json', 'testWithFeatures.json',
               printOp]
    stdoutData, _ = Popen(command, stdout=PIPE).communicate()
    return stdoutData
def calcF(output, beta):
    """Compute the F-beta score from the classifier's textual output.

    output -- text whose whitespace-separated tokens carry precision at
              index 2 and recall at index 5
    beta   -- weight of recall relative to precision

    Returns (1 + beta^2) * P * R / (beta^2 * P + R), or NaN when no score
    can be computed: empty, too short or non-numeric output, or a zero
    denominator (e.g. P == R == 0, or beta == 0 with R == 0).
    """
    undefined = float('nan')
    tokens = output.split() if output else []
    try:
        precision = float(tokens[2])
        recall = float(tokens[5])
    except (IndexError, ValueError):
        # Output did not have the expected shape; treat like "no output".
        return undefined
    betasq = beta ** 2
    denominator = betasq * precision + recall
    if denominator == 0:
        return undefined
    return (1 + betasq) * precision * recall / denominator
def _bestScoreIndex(scores):
    """Return the index of the first largest non-NaN score, or None."""
    best = None
    for index, score in enumerate(scores):
        if score != score:  # NaN never compares equal to itself
            continue
        if best is None or score > scores[best]:
            best = index
    return best


def main(argv):
    """Command-line entry point.

    argv -- arguments without the program name:
            trainFile testFile [-print|-greedy|-greedy-blank]
            [F-beta] [Features-To-Test]

    Without a greedy option the MEMM is run once and its output printed.
    With -greedy / -greedy-blank a greedy forward selection is run: in each
    round every remaining idea is tried on top of the current set and the
    one with the best F-beta is kept, until no idea improves the score.
    -greedy starts from featureFactory.goodIdeas, -greedy-blank from an
    empty set. Prints the usage text and exits when arguments are invalid.
    """
    usage = 'USAGE: python NER.py trainFile testFile [-print|-greedy|-greedy-blank] F-beta Features-To-Test'
    if len(argv) < 2:
        print(usage)
        sys.exit(0)

    printOp = ''
    greedy = False
    greedyBlank = False
    if len(argv) > 2:
        mode = argv[2]
        if mode == '-print':
            printOp = '-print'
        elif mode == '-greedy':
            greedy = True
        elif mode == '-greedy-blank':
            greedyBlank = True
        else:
            print(usage)
            sys.exit(0)

    featureFactory = FeatureFactory()
    # read the train and test data
    trainData = featureFactory.readData(argv[0])
    testData = featureFactory.readData(argv[1])

    if not (greedy or greedyBlank):
        print(TrainAndTest(featureFactory, trainData, testData, printOp))
        return

    beta = 1
    if len(argv) > 3:
        beta = float(argv[3])
    leftIdeas = list(range(len(featureFactory.ideas)))
    if greedyBlank or len(featureFactory.goodIdeas) == 0:
        goodIdeas = []
        bestF = 0.
    else:  # start with what we already discovered
        # Copy so the search does not mutate the factory's own list.
        goodIdeas = list(featureFactory.goodIdeas)
        leftIdeas = [idea for idea in leftIdeas if idea not in goodIdeas]
        ideas = [featureFactory.ideas[idea] for idea in goodIdeas]
        output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
        bestF = calcF(output, beta)
        if bestF != bestF:
            # A NaN baseline would make every "score > bestF" test fail.
            bestF = 0.
    if len(argv) > 4:
        # SECURITY: eval() executes arbitrary code from the command line.
        # Kept because expressions such as "range(5)" are accepted here;
        # only run this script with trusted arguments.
        leftIdeas = list(eval(argv[4]))
    print('Start Ideas: %s' % (goodIdeas,))
    print('Left Ideas: %s' % (leftIdeas,))

    improved = True
    while len(leftIdeas) > 0 and improved:
        improved = False
        F = []
        for i in leftIdeas:
            goodIdeas.append(i)  # tentatively add, removed again below
            print('Left Ideas: %s' % (leftIdeas,))
            print('Trying Ideas: %s' % (goodIdeas,))
            ideas = [featureFactory.ideas[idea] for idea in goodIdeas]
            output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
            F.append(calcF(output, beta))
            print('Left Ideas: %s' % (leftIdeas,))
            print('Tried Ideas: %s' % (goodIdeas,))
            print(output)
            # %g instead of %d so a fractional beta is not shown truncated.
            print('F(alpha=%g): %s -> %s' % (beta, bestF, F))
            goodIdeas.pop()
        # NaN scores are skipped; max() would return an order-dependent
        # result when the list contains NaN.
        bestIndex = _bestScoreIndex(F)
        if bestIndex is not None and F[bestIndex] > bestF:
            improved = True
            maxF = F[bestIndex]
            bestIdea = leftIdeas[bestIndex]
            print('Added Idea: %s F(alpha=%g): %s -> %s'
                  % (bestIdea, beta, bestF, maxF))
            bestF = maxF
            goodIdeas.append(bestIdea)
            leftIdeas.remove(bestIdea)
    # NOTE(review): the original indentation was lost; printing the final
    # selection once after the loop is assumed -- confirm against the source.
    print(goodIdeas)


if __name__ == '__main__':
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment