# /FeatureFactory.py

## FeatureFactory.py
import json, sys, collections
import base64
from Datum import Datum


class FeatureData :
    """
    Bundle of everything a feature callable gets to look at for one token.

    Attributes:
        w   -- the list of words handed to the constructor
        pL  -- the label of the previous position
        pos -- index into w of the word being featurised
        wD  -- the word dictionary handed to the constructor
    """
    def __init__(self, words, prevLabel, position,
                 wordsDict):
        # Store the context first, then the corpus-level data.
        self.pL, self.pos = prevLabel, position
        self.w, self.wD = words, wordsDict

class FeatureFactory:
    """
    Creates feature strings for tokens, reads word/label files and writes
    the featurised data as one JSON object per line.

    Candidate features ("ideas") are callables that take a FeatureData and
    return a feature string, or a falsy value when the feature does not
    apply. self.ideas holds every candidate; self.goodIdeas holds the
    indices into self.ideas that computeFeatures uses when it is not given
    an explicit list of ideas.
    """
    def __init__(self):
        """
        Add any necessary initialization steps for your features here.
        Using this constructor is optional. Depending on your
        features, you may not need to initialize anything.
        """
        self.prevLabels = {}
        self.ideas = self.createFeatureIdeas()
        self.goodIdeas = [] #Once you find some good ideas, populate this list

    def _isIterable(self, obj) :
        """
        Return True when obj can be iterated over (for example a list of
        callables) and False for a single callable.

        Uses iter() instead of collections.Iterable, which no longer exists
        in recent Python 3 releases.
        """
        try :
            iter(obj)
        except TypeError :
            return False
        return True

    def addFeatureIdea(self, ideasList, features, conds = None) :
        """
        Append a new idea to ideasList.

        features -- a single callable fd -> string, or an iterable of such
                    callables whose results are concatenated, each followed
                    by ', '.
        conds    -- None (always applies), a single callable fd -> bool, or
                    an iterable of such callables that must all hold.

        The appended idea returns the feature string when the condition
        holds and a falsy value otherwise. A preview of the feature on a
        dummy three-word corpus is printed as a side effect.
        """
        if self._isIterable(features) :
            # Snapshot the iterable so one-shot iterators survive repeated calls.
            parts = list(features)
            feature = lambda fd : ''.join(part(fd) + ', ' for part in parts)
        else :
            feature = features
        if conds is None :
            cond = lambda fd: True
        elif self._isIterable(conds) :
            checks = list(conds)
            cond = lambda fd : all(check(fd) for check in checks)
        else :
            cond = conds

        tempFeature = FeatureData(['Prev', 'Curr', 'Next'], 'P', 1,
                                  {'Prev':1, 'Curr':1, 'Next':1})
        # Single-argument print works as a statement and as a function.
        print('Idea %d : %s' % (len(ideasList), feature(tempFeature)))

        ideasList.append(lambda fd : cond(fd) and feature(fd))

    def createCase(self, t) :
        """Return a feature callable that always yields 'case=' + t."""
        return lambda fd : 'case=' + t

    def notCond(self, cond) :
        """Return a condition that is the logical negation of cond."""
        return lambda fd : not cond(fd)

    def orConds(self, conds) :
        """
        Return a condition that holds when at least one of conds holds.

        The previous version returned a lambda that itself returned a
        lambda, so the combined condition was always truthy.
        """
        checks = list(conds)
        return lambda fd : any(check(fd) for check in checks)

    def condsToFeature(self, text, conds) :
        """
        Return a feature callable yielding text + '=yes' when the condition
        holds and text + '=no' otherwise.

        conds is a single callable or an iterable of callables that must
        all hold. The previous version folded 'and' starting from False,
        which made every iterable of conditions evaluate to 'no'.
        """
        if self._isIterable(conds) :
            checks = list(conds)
            holds = lambda fd : all(check(fd) for check in checks)
        else :
            holds = conds
        return lambda fd : text + '=' + ('yes' if holds(fd) else 'no')

    def createFeatureIdeas(self):
        """
        Build and return the list of candidate ideas. The trailing #n
        comments give each idea's index in the returned list.
        """
        featureIdeas = []
        currWord = lambda fd : "word=" + fd.w[fd.pos]
        prevLabel = lambda fd : "prevLabel=" + fd.pL

        # [:1] instead of [0] so an empty word yields 'no' rather than IndexError.
        currWordTitle = self.condsToFeature('CurrWordTitle', lambda fd : fd.w[fd.pos][:1].isupper())

        hasPrev = lambda fd : fd.pos > 0
        prevWord = lambda fd : "prevWord=" + fd.w[fd.pos-1]

        self.addFeatureIdea(featureIdeas, currWord) #0
        self.addFeatureIdea(featureIdeas, prevLabel) #1
        self.addFeatureIdea(featureIdeas, [currWord, currWordTitle]) #2
        self.addFeatureIdea(featureIdeas, [currWord, prevLabel]) #3

        self.addFeatureIdea(featureIdeas, prevWord, hasPrev) #4
        return featureIdeas

    def computeFeatures(self, words, previousLabel, position, ideas=None):
        """
        Words is a list of the words in the entire corpus, previousLabel is the label
        for position-1 (or O if it's the start of a new sentence), and position
        is the word you are adding features for. PreviousLabel must be the
        only label that is visible to this method.

        ideas is an optional sequence of idea callables; when omitted, the
        ideas selected by self.goodIdeas are used. Returns the list of
        truthy values produced by the ideas.
        """
        # Word counts are rebuilt at position 0, and also on the very first
        # call so that starting at another position does not fail.
        if position == 0 or not hasattr(self, 'wordsDict') : #Build Dictionaries
            self.wordsDict = collections.defaultdict(int)
            for word in words :
                self.wordsDict[word] += 1

        featureData = FeatureData(words, previousLabel, position,
                                  self.wordsDict)
        if ideas is None :
            ideas = [self.ideas[index] for index in self.goodIdeas]
        features = []
        for idea in ideas :
            feature = idea(featureData)
            if feature :
                features.append(feature)
        return features

    """ Do not modify this method """
    def readData(self, filename):
        data = []

        for line in open(filename, 'r'):
            line_split = line.split()
            # remove emtpy lines
            if len(line_split) < 2:
                continue
            word = line_split[0]
            label = line_split[1]

            datum = Datum(word, label)
            data.append(datum)

        return data

    """ Do not modify this method """
    def readTestData(self, ch_aux):
        data = []

        for line in ch_aux.splitlines():
            line_split = line.split()
            # remove emtpy lines
            if len(line_split) < 2:
                continue
            word = line_split[0]
            label = line_split[1]

            datum = Datum(word, label)
            data.append(datum)

        return data


    """ Do not modify this method """
    def setFeaturesTrain(self, data, ideas=None):
        newData = []
        words = []

        for datum in data:
            words.append(datum.word)

        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        previousLabel = "O"
        for i in range(0, len(data)):
            datum = data[i]

            newDatum = Datum(datum.word, datum.label)
            newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
            newDatum.previousLabel = previousLabel
            newData.append(newDatum)

            previousLabel = datum.label

        return newData

    """
    Compute the features for all possible previous labels
    for Viterbi algorithm. Do not modify this method
    """
    def setFeaturesTest(self, data, ideas=None):
        newData = []
        words = []
        labels = []
        labelIndex = {}

        for datum in data:
            words.append(datum.word)
            if not labelIndex.has_key(datum.label):
                labelIndex[datum.label] = len(labels)
                labels.append(datum.label)

        ## This is so that the feature factory code doesn't
        ## accidentally use the true label info
        for i in range(0, len(data)):
            datum = data[i]

            if i == 0:
                previousLabel = "O"
                datum.features = self.computeFeatures(words, previousLabel, i, ideas)

                newDatum = Datum(datum.word, datum.label)
                newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
                newDatum.previousLabel = previousLabel
                newData.append(newDatum)
            else:
                for previousLabel in labels:
                    datum.features = self.computeFeatures(words, previousLabel, i, ideas)

                    newDatum = Datum(datum.word, datum.label)
                    newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
                    newDatum.previousLabel = previousLabel
                    newData.append(newDatum)

        return newData

    """
    write words, labels, and features into a json file
    Do not modify this method
    """
    def writeData(self, data, filename):
        outFile = open(filename + '.json', 'w')
        for i in range(0, len(data)):
            datum = data[i]
            jsonObj = {}
            jsonObj['_label'] = datum.label
            jsonObj['_word']= base64.b64encode(datum.word)
            jsonObj['_prevLabel'] = datum.previousLabel

            featureObj = {}
            features = datum.features
            for j in range(0, len(features)):
                feature = features[j]
                featureObj['_'+feature] = feature
            jsonObj['_features'] = featureObj

            outFile.write(json.dumps(jsonObj) + '\n')

        outFile.close()

## NER.py
import sys, os
from subprocess import Popen, PIPE
from FeatureFactory import FeatureFactory

"""
Do not modify this class
The submit script does not use this class
It directly calls the methods of FeatureFactory and MEMM classes.
"""

def TrainAndTest(featureFactory, trainData, testData, printOp, ideas=None) :
    """
    Featurise both data sets, write them to 'trainWithFeatures.json' and
    'testWithFeatures.json', run the Java MEMM program on those files and
    return what it wrote to standard output.

    printOp is passed to the Java program as its last argument; ideas is
    forwarded unchanged to the feature factory.
    """
    # Attach features to the training and test data.
    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData, ideas)
    testDataWithFeatures = featureFactory.setFeaturesTest(testData, ideas)

    # Persist both sets as JSON for the Java side.
    featureFactory.writeData(trainDataWithFeatures, 'trainWithFeatures')
    featureFactory.writeData(testDataWithFeatures, 'testWithFeatures')

    # Launch the MEMM and capture its standard output.
    command = ['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
               'trainWithFeatures.json', 'testWithFeatures.json',
               printOp]
    memmProcess = Popen(command, stdout=PIPE)
    stdoutData = memmProcess.communicate()[0]
    return stdoutData


def calcF(output, beta) :
    """
    Compute the F-beta score from the evaluator's textual output.

    output -- whitespace-separated text; the tokens at index 2 and 5 are
              parsed as precision and recall.
    beta   -- weight of recall relative to precision.

    Returns NaN when the output has fewer than six tokens (including empty
    output) or when the score is undefined because the denominator is zero
    (for example precision and recall both 0, or beta and recall both 0).
    """
    outputs = output.split()
    if len(outputs) < 6 :
        return float('nan')
    precision = float(outputs[2])
    recall = float(outputs[5])
    betasq = beta**2
    denominator = betasq*precision + recall
    # Guard the division: covers precision == recall == 0 as before and
    # additionally beta == 0 with recall == 0.
    if denominator == 0 :
        return float('nan')
    return (1+betasq)*precision*recall/denominator

def main(argv):
    """
    Command-line driver.

    argv[0] -- training file, argv[1] -- test file.
    argv[2] -- optional mode: '-print' (passed on to the MEMM), '-greedy'
               or '-greedy-blank' (greedy forward selection of ideas).
    argv[3] -- optional beta for the F-score used by the greedy modes.
    argv[4] -- optional Python expression giving the candidate idea
               indices for the greedy modes.

    Without a greedy mode the MEMM is run once and its output printed. In
    the greedy modes every remaining idea is tried on top of the current
    selection; the idea with the highest F-score is kept as long as it
    beats the best score so far, and the final selection is printed.
    """
    from math import isnan

    usage = 'USAGE: python NER.py trainFile testFile [-print|-greedy|-greedy-blank] F-beta Features-To-Test'
    if len(argv) < 2:
        print(usage)
        sys.exit(0)

    printOp = ''
    greedy = False
    greedyBlank = False
    if len(argv) > 2:
        if argv[2] == '-print' :
            printOp = '-print'
        elif argv[2] == '-greedy' :
            greedy = True
        elif argv[2] == '-greedy-blank' :
            greedyBlank = True
        else :
            print(usage)
            sys.exit(0)

    featureFactory = FeatureFactory()

    # read the train and test data
    trainData = featureFactory.readData(argv[0])
    testData = featureFactory.readData(argv[1])

    if not (greedy or greedyBlank) :
        print(TrainAndTest(featureFactory, trainData, testData, printOp))
        return

    beta = 1
    if len(argv) > 3 :
        beta = float(argv[3])
    leftIdeas = list(range(len(featureFactory.ideas)))

    if greedyBlank or len(featureFactory.goodIdeas) == 0:
        goodIdeas = []
        bestF = 0.
    else : #start with what we already discovered
        # Copy so the search does not mutate the factory's own list.
        goodIdeas = list(featureFactory.goodIdeas)
        leftIdeas = [idea for idea in leftIdeas if idea not in goodIdeas]
        ideas = [featureFactory.ideas[idea] for idea in goodIdeas]
        output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
        bestF = calcF(output, beta)
        if isnan(bestF) :
            # NaN never compares smaller, which would block every improvement.
            bestF = 0.

    if len(argv) > 4 :
        # NOTE(security): eval executes arbitrary code from the command
        # line; only run this script with arguments you trust.
        leftIdeas = list(eval(argv[4]))

    print('Start Ideas: %s' % (goodIdeas,))
    print('Left Ideas: %s' % (leftIdeas,))

    improved = True
    while len(leftIdeas) > 0 and improved :
        improved = False
        F = []
        for i in leftIdeas :
            trial = goodIdeas + [i]
            print('Left Ideas: %s' % (leftIdeas,))
            print('Trying Ideas: %s' % (trial,))
            # A real list: the ideas are iterated once per token downstream.
            ideas = [featureFactory.ideas[idea] for idea in trial]
            output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
            F.append(calcF(output, beta))
            print('Left Ideas: %s' % (leftIdeas,))
            print('Tried Ideas: %s' % (trial,))
            print(output)
            print('F(beta=%g): %s -> %s' % (beta, bestF, F))

        # Pick the first idea with the highest score above bestF. NaN scores
        # are skipped explicitly so they cannot mask a real improvement.
        bestIdea = None
        maxF = bestF
        for idea, score in zip(leftIdeas, F) :
            if not isnan(score) and score > maxF :
                bestIdea = idea
                maxF = score
        if bestIdea is not None :
            improved = True
            print('Added Idea: %s F(beta=%g): %s -> %s' % (bestIdea, beta, bestF, maxF))
            bestF = maxF
            goodIdeas.append(bestIdea)
            leftIdeas.remove(bestIdea)

    print(goodIdeas)

# Run the command-line driver only when this file is executed as a script,
# passing along the arguments that follow the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
	import json, sys, collections
	import base64
	from Datum import Datum


	class FeatureData :
	    """
	    Bundle of everything a feature callable gets to look at for one token.

	    Attributes:
	        w   -- the list of words handed to the constructor
	        pL  -- the label of the previous position
	        pos -- index into w of the word being featurised
	        wD  -- the word dictionary handed to the constructor
	    """
	    def __init__(self, words, prevLabel, position,
	                 wordsDict):
	        self.w = words
	        self.pL = prevLabel
	        self.pos = position
	        self.wD = wordsDict

	class FeatureFactory:
	    """
	    Creates feature strings for tokens, reads word/label files and writes
	    the featurised data as one JSON object per line.

	    Candidate features ("ideas") are callables that take a FeatureData and
	    return a feature string, or a falsy value when the feature does not
	    apply. self.ideas holds every candidate; self.goodIdeas holds the
	    indices into self.ideas that computeFeatures uses when it is not given
	    an explicit list of ideas.
	    """
	    def __init__(self):
	        """
	        Add any necessary initialization steps for your features here.
	        Using this constructor is optional. Depending on your
	        features, you may not need to initialize anything.
	        """
	        self.prevLabels = {}
	        self.ideas = self.createFeatureIdeas()
	        self.goodIdeas = [] #Once you find some good ideas, populate this list

	    def _isIterable(self, obj) :
	        """
	        Return True when obj can be iterated over (for example a list of
	        callables) and False for a single callable.

	        Uses iter() instead of collections.Iterable, which no longer exists
	        in recent Python 3 releases.
	        """
	        try :
	            iter(obj)
	        except TypeError :
	            return False
	        return True

	    def addFeatureIdea(self, ideasList, features, conds = None) :
	        """
	        Append a new idea to ideasList.

	        features -- a single callable fd -> string, or an iterable of such
	                    callables whose results are concatenated, each followed
	                    by ', '.
	        conds    -- None (always applies), a single callable fd -> bool, or
	                    an iterable of such callables that must all hold.

	        The appended idea returns the feature string when the condition
	        holds and a falsy value otherwise. A preview of the feature on a
	        dummy three-word corpus is printed as a side effect.
	        """
	        if self._isIterable(features) :
	            # Snapshot the iterable so one-shot iterators survive repeated calls.
	            parts = list(features)
	            feature = lambda fd : ''.join(part(fd) + ', ' for part in parts)
	        else :
	            feature = features
	        if conds is None :
	            cond = lambda fd: True
	        elif self._isIterable(conds) :
	            checks = list(conds)
	            cond = lambda fd : all(check(fd) for check in checks)
	        else :
	            cond = conds

	        tempFeature = FeatureData(['Prev', 'Curr', 'Next'], 'P', 1,
	                                  {'Prev':1, 'Curr':1, 'Next':1})
	        # Single-argument print works as a statement and as a function.
	        print('Idea %d : %s' % (len(ideasList), feature(tempFeature)))

	        ideasList.append(lambda fd : cond(fd) and feature(fd))

	    def createCase(self, t) :
	        """Return a feature callable that always yields 'case=' + t."""
	        return lambda fd : 'case=' + t

	    def notCond(self, cond) :
	        """Return a condition that is the logical negation of cond."""
	        return lambda fd : not cond(fd)

	    def orConds(self, conds) :
	        """
	        Return a condition that holds when at least one of conds holds.

	        The previous version returned a lambda that itself returned a
	        lambda, so the combined condition was always truthy.
	        """
	        checks = list(conds)
	        return lambda fd : any(check(fd) for check in checks)

	    def condsToFeature(self, text, conds) :
	        """
	        Return a feature callable yielding text + '=yes' when the condition
	        holds and text + '=no' otherwise.

	        conds is a single callable or an iterable of callables that must
	        all hold. The previous version folded 'and' starting from False,
	        which made every iterable of conditions evaluate to 'no'.
	        """
	        if self._isIterable(conds) :
	            checks = list(conds)
	            holds = lambda fd : all(check(fd) for check in checks)
	        else :
	            holds = conds
	        return lambda fd : text + '=' + ('yes' if holds(fd) else 'no')

	    def createFeatureIdeas(self):
	        """
	        Build and return the list of candidate ideas. The trailing #n
	        comments give each idea's index in the returned list.
	        """
	        featureIdeas = []
	        currWord = lambda fd : "word=" + fd.w[fd.pos]
	        prevLabel = lambda fd : "prevLabel=" + fd.pL

	        # [:1] instead of [0] so an empty word yields 'no' rather than IndexError.
	        currWordTitle = self.condsToFeature('CurrWordTitle', lambda fd : fd.w[fd.pos][:1].isupper())

	        hasPrev = lambda fd : fd.pos > 0
	        prevWord = lambda fd : "prevWord=" + fd.w[fd.pos-1]

	        self.addFeatureIdea(featureIdeas, currWord) #0
	        self.addFeatureIdea(featureIdeas, prevLabel) #1
	        self.addFeatureIdea(featureIdeas, [currWord, currWordTitle]) #2
	        self.addFeatureIdea(featureIdeas, [currWord, prevLabel]) #3

	        self.addFeatureIdea(featureIdeas, prevWord, hasPrev) #4
	        return featureIdeas

	    def computeFeatures(self, words, previousLabel, position, ideas=None):
	        """
	        Words is a list of the words in the entire corpus, previousLabel is the label
	        for position-1 (or O if it's the start of a new sentence), and position
	        is the word you are adding features for. PreviousLabel must be the
	        only label that is visible to this method.

	        ideas is an optional sequence of idea callables; when omitted, the
	        ideas selected by self.goodIdeas are used. Returns the list of
	        truthy values produced by the ideas.
	        """
	        # Word counts are rebuilt at position 0, and also on the very first
	        # call so that starting at another position does not fail.
	        if position == 0 or not hasattr(self, 'wordsDict') : #Build Dictionaries
	            self.wordsDict = collections.defaultdict(int)
	            for word in words :
	                self.wordsDict[word] += 1

	        featureData = FeatureData(words, previousLabel, position,
	                                  self.wordsDict)
	        if ideas is None :
	            ideas = [self.ideas[index] for index in self.goodIdeas]
	        features = []
	        for idea in ideas :
	            feature = idea(featureData)
	            if feature :
	                features.append(feature)
	        return features

	    """ Do not modify this method """
	    def readData(self, filename):
	        data = []

	        for line in open(filename, 'r'):
	            line_split = line.split()
	            # remove emtpy lines
	            if len(line_split) < 2:
	                continue
	            word = line_split[0]
	            label = line_split[1]

	            datum = Datum(word, label)
	            data.append(datum)

	        return data

	    """ Do not modify this method """
	    def readTestData(self, ch_aux):
	        data = []

	        for line in ch_aux.splitlines():
	            line_split = line.split()
	            # remove emtpy lines
	            if len(line_split) < 2:
	                continue
	            word = line_split[0]
	            label = line_split[1]

	            datum = Datum(word, label)
	            data.append(datum)

	        return data


	    """ Do not modify this method """
	    def setFeaturesTrain(self, data, ideas=None):
	        newData = []
	        words = []

	        for datum in data:
	            words.append(datum.word)

	        ## This is so that the feature factory code doesn't
	        ## accidentally use the true label info
	        previousLabel = "O"
	        for i in range(0, len(data)):
	            datum = data[i]

	            newDatum = Datum(datum.word, datum.label)
	            newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
	            newDatum.previousLabel = previousLabel
	            newData.append(newDatum)

	            previousLabel = datum.label

	        return newData

	    """
	    Compute the features for all possible previous labels
	    for Viterbi algorithm. Do not modify this method
	    """
	    def setFeaturesTest(self, data, ideas=None):
	        newData = []
	        words = []
	        labels = []
	        labelIndex = {}

	        for datum in data:
	            words.append(datum.word)
	            if not labelIndex.has_key(datum.label):
	                labelIndex[datum.label] = len(labels)
	                labels.append(datum.label)

	        ## This is so that the feature factory code doesn't
	        ## accidentally use the true label info
	        for i in range(0, len(data)):
	            datum = data[i]

	            if i == 0:
	                previousLabel = "O"
	                datum.features = self.computeFeatures(words, previousLabel, i, ideas)

	                newDatum = Datum(datum.word, datum.label)
	                newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
	                newDatum.previousLabel = previousLabel
	                newData.append(newDatum)
	            else:
	                for previousLabel in labels:
	                    datum.features = self.computeFeatures(words, previousLabel, i, ideas)

	                    newDatum = Datum(datum.word, datum.label)
	                    newDatum.features = self.computeFeatures(words, previousLabel, i, ideas)
	                    newDatum.previousLabel = previousLabel
	                    newData.append(newDatum)

	        return newData

	    """
	    write words, labels, and features into a json file
	    Do not modify this method
	    """
	    def writeData(self, data, filename):
	        outFile = open(filename + '.json', 'w')
	        for i in range(0, len(data)):
	            datum = data[i]
	            jsonObj = {}
	            jsonObj['_label'] = datum.label
	            jsonObj['_word']= base64.b64encode(datum.word)
	            jsonObj['_prevLabel'] = datum.previousLabel

	            featureObj = {}
	            features = datum.features
	            for j in range(0, len(features)):
	                feature = features[j]
	                featureObj['_'+feature] = feature
	            jsonObj['_features'] = featureObj

	            outFile.write(json.dumps(jsonObj) + '\n')

	        outFile.close()
	import sys, os
	from subprocess import Popen, PIPE
	from FeatureFactory import FeatureFactory

	"""
	Do not modify this class
	The submit script does not use this class
	It directly calls the methods of FeatureFactory and MEMM classes.
	"""

	def TrainAndTest(featureFactory, trainData, testData, printOp, ideas=None) :
	    """
	    Featurise both data sets, write them to 'trainWithFeatures.json' and
	    'testWithFeatures.json', run the Java MEMM program on those files and
	    return what it wrote to standard output.

	    printOp is passed to the Java program as its last argument; ideas is
	    forwarded unchanged to the feature factory.
	    """
	    # add the features
	    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData, ideas)
	    testDataWithFeatures = featureFactory.setFeaturesTest(testData, ideas)
	    # write the updated data into JSON files
	    featureFactory.writeData(trainDataWithFeatures, 'trainWithFeatures')
	    featureFactory.writeData(testDataWithFeatures, 'testWithFeatures')

	    # run MEMM
	    return Popen(['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
	                  'trainWithFeatures.json', 'testWithFeatures.json',
	                  printOp], stdout=PIPE).communicate()[0]


	def calcF(output, beta) :
	    """
	    Compute the F-beta score from the evaluator's textual output.

	    output -- whitespace-separated text; the tokens at index 2 and 5 are
	              parsed as precision and recall.
	    beta   -- weight of recall relative to precision.

	    Returns NaN when the output has fewer than six tokens (including empty
	    output) or when the score is undefined because the denominator is zero
	    (for example precision and recall both 0, or beta and recall both 0).
	    """
	    outputs = output.split()
	    if len(outputs) < 6 :
	        return float('nan')
	    precision = float(outputs[2])
	    recall = float(outputs[5])
	    betasq = beta**2
	    denominator = betasq*precision + recall
	    # Guard the division: covers precision == recall == 0 and also
	    # beta == 0 with recall == 0.
	    if denominator == 0 :
	        return float('nan')
	    # The multiplication operators were missing from this line.
	    return (1+betasq)*precision*recall/denominator

	def main(argv):
	    """
	    Command-line driver.

	    argv[0] -- training file, argv[1] -- test file.
	    argv[2] -- optional mode: '-print' (passed on to the MEMM), '-greedy'
	               or '-greedy-blank' (greedy forward selection of ideas).
	    argv[3] -- optional beta for the F-score used by the greedy modes.
	    argv[4] -- optional Python expression giving the candidate idea
	               indices for the greedy modes.

	    Without a greedy mode the MEMM is run once and its output printed. In
	    the greedy modes every remaining idea is tried on top of the current
	    selection; the idea with the highest F-score is kept as long as it
	    beats the best score so far, and the final selection is printed.
	    """
	    from math import isnan

	    usage = 'USAGE: python NER.py trainFile testFile [-print|-greedy|-greedy-blank] F-beta Features-To-Test'
	    if len(argv) < 2:
	        print(usage)
	        sys.exit(0)

	    printOp = ''
	    greedy = False
	    greedyBlank = False
	    if len(argv) > 2:
	        if argv[2] == '-print' :
	            printOp = '-print'
	        elif argv[2] == '-greedy' :
	            greedy = True
	        elif argv[2] == '-greedy-blank' :
	            greedyBlank = True
	        else :
	            print(usage)
	            sys.exit(0)

	    featureFactory = FeatureFactory()

	    # read the train and test data
	    trainData = featureFactory.readData(argv[0])
	    testData = featureFactory.readData(argv[1])

	    if not (greedy or greedyBlank) :
	        print(TrainAndTest(featureFactory, trainData, testData, printOp))
	        return

	    beta = 1
	    if len(argv) > 3 :
	        beta = float(argv[3])
	    leftIdeas = list(range(len(featureFactory.ideas)))

	    if greedyBlank or len(featureFactory.goodIdeas) == 0:
	        goodIdeas = []
	        bestF = 0.
	    else : #start with what we already discovered
	        # Copy so the search does not mutate the factory's own list.
	        goodIdeas = list(featureFactory.goodIdeas)
	        leftIdeas = [idea for idea in leftIdeas if idea not in goodIdeas]
	        ideas = [featureFactory.ideas[idea] for idea in goodIdeas]
	        output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
	        bestF = calcF(output, beta)
	        if isnan(bestF) :
	            # NaN never compares smaller, which would block every improvement.
	            bestF = 0.

	    if len(argv) > 4 :
	        # NOTE(security): eval executes arbitrary code from the command
	        # line; only run this script with arguments you trust.
	        leftIdeas = list(eval(argv[4]))

	    print('Start Ideas: %s' % (goodIdeas,))
	    print('Left Ideas: %s' % (leftIdeas,))

	    improved = True
	    while len(leftIdeas) > 0 and improved :
	        improved = False
	        F = []
	        for i in leftIdeas :
	            trial = goodIdeas + [i]
	            print('Left Ideas: %s' % (leftIdeas,))
	            print('Trying Ideas: %s' % (trial,))
	            # A real list: the ideas are iterated once per token downstream.
	            ideas = [featureFactory.ideas[idea] for idea in trial]
	            output = TrainAndTest(featureFactory, trainData, testData, printOp, ideas)
	            F.append(calcF(output, beta))
	            print('Left Ideas: %s' % (leftIdeas,))
	            print('Tried Ideas: %s' % (trial,))
	            print(output)
	            print('F(beta=%g): %s -> %s' % (beta, bestF, F))

	        # Pick the first idea with the highest score above bestF. NaN scores
	        # are skipped explicitly so they cannot mask a real improvement.
	        bestIdea = None
	        maxF = bestF
	        for idea, score in zip(leftIdeas, F) :
	            if not isnan(score) and score > maxF :
	                bestIdea = idea
	                maxF = score
	        if bestIdea is not None :
	            improved = True
	            print('Added Idea: %s F(beta=%g): %s -> %s' % (bestIdea, beta, bestF, maxF))
	            bestF = maxF
	            goodIdeas.append(bestIdea)
	            leftIdeas.remove(bestIdea)

	    print(goodIdeas)

	# Run the command-line driver only when this file is executed as a script,
	# passing along the arguments that follow the program name.
	if __name__ == '__main__':
	    main(sys.argv[1:])