Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env python
import nltk
import sys
import datetime
import matplotlib.pyplot as plt
import csv
# Number of benchmark runs. Run number i+1 parses the base sentence followed
# by i copies of "with the telescope", so sentences grow by three tokens per run.
LIMIT=14
# Toy context-free grammar for the benchmark sentences. Note the two
# left-recursive rules (VP -> VP PP and N -> N PP): a trailing PP can attach
# either to the verb phrase or to the preceding noun.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> VP PP
N -> 'man'
D -> 'the'
P -> 'with'
NP -> D N
VP -> V NP
N -> 'girl'
NP -> 'John'
V -> 'saw'
N -> N PP
PP -> P NP
N -> 'telescope'
NP -> 'Mary'
""")
def parse_sentence(this_grammar, this_sent):
    """Run an Earley chart parse of one tokenized sentence.

    Args:
        this_grammar: grammar used to construct the
            ``nltk.parse.EarleyChartParser``.
        this_sent: the sentence as a list of tokens.

    Returns:
        Whatever the parser's ``parse_one`` returns for ``this_sent``.
    """
    # A fresh parser is built on every call, so its construction cost is
    # part of whatever the caller measures around this function.
    return nltk.parse.EarleyChartParser(this_grammar).parse_one(this_sent)
def parse(LIMIT, grammar):
    """Time the parser on progressively longer sentences and log the results.

    For each run ``i`` in ``0 .. LIMIT - 1`` the base sentence
    "the man saw the girl" is extended with ``i`` copies of
    "with the telescope" and parsed once via ``parse_sentence``; the elapsed
    wall-clock time is recorded. All rows are then written to
    ``logfile.log`` as space-separated columns under the header
    ``Run Microseconds SentenceLength NormalizedMicroseconds``
    (no trailing newline).

    Args:
        LIMIT: number of runs to perform.
        grammar: grammar object passed unchanged to ``parse_sentence``.
    """
    base_tokens = "the man saw the girl".split()
    pp_tokens = "with the telescope".split()
    logfile = ["Run Microseconds SentenceLength NormalizedMicroseconds"]

    for i in range(LIMIT):
        sent = base_tokens + pp_tokens * i
        # print ' '.join(sent)
        start = datetime.datetime.now()
        parse_sentence(grammar, sent)
        dur = datetime.datetime.now() - start

        # timedelta.microseconds holds only the sub-second component
        # (0..999999); using it alone under-reports any run that takes one
        # second or longer. Fold days and seconds in to get the true total.
        usec = (dur.days * 86400 + dur.seconds) * 1000000 + dur.microseconds

        # Normalize the duration by the sentence length (in tokens).
        sentlen = len(sent)
        logfile.append(
            "{0} {1} {2} {3}".format(i + 1, usec, sentlen, usec / float(sentlen))
        )

    with open('logfile.log', 'w') as fh:
        fh.write('\n'.join(logfile))
def plot_data():
    """Plot the normalized timings from ``logfile.log`` into ``earley.png``.

    Reads the space-delimited log, finds the ``NormalizedMicroseconds``
    column from the header row, plots its values in file order (one point
    per run) and saves the figure.
    """
    # Text mode matches how parse() writes the file ('w') and is what
    # csv.reader requires on Python 3; it also works on Python 2.
    with open('logfile.log', 'r') as fh:
        reader = csv.reader(fh, delimiter=' ')
        # The next() builtin works on Python 2.6+ and 3; reader.next() is
        # Python 2 only.
        headers = next(reader)
        dataIndex = headers.index('NormalizedMicroseconds')
        # csv.reader yields [] for blank lines; skip them instead of
        # failing with an IndexError.
        data = [float(row[dataIndex]) for row in reader if row]

    ## plot it and save the output
    plt.plot(data)
    plt.ylabel('Time in microseconds (normalized by sentence length)')
    plt.xlabel('Runs')
    plt.savefig('earley.png')
if __name__ == '__main__':
    # Usage: run with no arguments to execute the benchmark and write
    # logfile.log; run with --plot to render that log to earley.png.
    # The length check matters: indexing sys.argv[1] unguarded raises
    # IndexError when no argument is given, which made the benchmark
    # branch unreachable from a bare invocation.
    if len(sys.argv) > 1 and sys.argv[1] == '--plot':
        plot_data()
    else:
        parse(LIMIT, grammar)
Run Microseconds SentenceLength NormalizedMicroseconds
1 1567 5 313.4
2 2507 8 313.375
3 3781 11 343.727272727
4 5553 14 396.642857143
5 8172 17 480.705882353
6 10476 20 523.8
7 15129 23 657.782608696
8 23250 26 894.230769231
9 43597 29 1503.34482759
10 141894 32 4434.1875
11 434989 35 12428.2571429
12 669095 38 17607.7631579
13 428897 41 10460.902439
14 940391 44 21372.5227273
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment