inktrap/EarleyPythonTiming.py

## earley.png

      
    Raw
  

              earley.png
            
          
## EarleyPythonTiming.py
#!/usr/bin/env python
import nltk
import sys
import datetime
import matplotlib.pyplot as plt
import csv

# Number of timing runs handed to parse() by the script entry point.
LIMIT=14
# Toy context-free grammar used for the benchmark sentences
# ("the man saw the girl" plus repeated "with the telescope").
# Both VP -> VP PP and N -> N PP accept a trailing PP, so a PP can attach
# either to the verb phrase or to the noun.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> VP PP
N -> 'man'
D -> 'the'
P -> 'with'
NP -> D N
VP -> V NP
N -> 'girl'
NP -> 'John'
V -> 'saw'
N -> N PP
PP -> P NP
N -> 'telescope'
NP -> 'Mary'
""")

def parse_sentence(this_grammar, this_sent):
    """Parse one tokenised sentence with an Earley chart parser.

    A new ``nltk.parse.EarleyChartParser`` is built from ``this_grammar``
    on every call, and whatever its ``parse_one`` yields for ``this_sent``
    is returned unchanged.
    """
    return nltk.parse.EarleyChartParser(this_grammar).parse_one(this_sent)

def parse(LIMIT, grammar):
    """Time the parser on progressively longer sentences and log the results.

    Run ``i`` (counting from 0) parses "the man saw the girl" followed by
    ``i`` copies of "with the telescope". One line per run is written to
    ``logfile.log`` in the current directory, below the header
    ``Run Microseconds SentenceLength NormalizedMicroseconds``.

    Args:
        LIMIT: number of runs to perform.
        grammar: grammar object passed through to ``parse_sentence``.
    """
    logfile = ["Run Microseconds SentenceLength NormalizedMicroseconds"]
    base = "the man saw the girl".split()
    extension = "with the telescope".split()

    for i in range(LIMIT):
        sent = base + extension * i

        start = datetime.datetime.now()
        parse_sentence(grammar, sent)
        end = datetime.datetime.now()
        dur = end - start

        # timedelta.microseconds is only the sub-second component
        # (0..999999); a parse lasting one second or more would be logged
        # as if it had been faster. Fold days and seconds in so the full
        # elapsed time is recorded.
        usec = (dur.days * 86400 + dur.seconds) * 1000000 + dur.microseconds
        sentlen = len(sent)

        # Last column: elapsed time normalised by sentence length.
        logfile.append("{0} {1} {2} {3}".format(
            i + 1, usec, sentlen, usec / float(sentlen)))

    with open('logfile.log', 'w') as fh:
        fh.write('\n'.join(logfile))

def plot_data():
    """Plot the normalised timings from ``logfile.log`` and save ``earley.png``.

    The log is read as a space-delimited table whose first row is a header;
    the ``NormalizedMicroseconds`` column is converted to floats and plotted
    in file order.

    Raises:
        ValueError: if the header has no ``NormalizedMicroseconds`` column
            or a value in that column is not a number.
    """
    # The Python 2 csv module expects a binary-mode file, while Python 3
    # needs a text-mode one.
    mode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open('logfile.log', mode) as fh:
        reader = csv.reader(fh, delimiter=' ')
        # next(reader) works on Python 2.6+ and 3; reader.next() exists
        # only on Python 2.
        headers = next(reader)
        dataIndex = headers.index('NormalizedMicroseconds')
        # Blank lines come back as empty rows; skip them rather than crash.
        data = [float(row[dataIndex]) for row in reader if row]

    ## plot it and save the output
    plt.plot(data)
    plt.ylabel('Time in microseconds (normalized by sentence length)')
    plt.xlabel('Runs')
    plt.savefig('earley.png')

if __name__ == '__main__':
    # Check the argument count first: with no command-line argument,
    # sys.argv[1] raises IndexError and the default timing run below
    # could never be reached.
    if len(sys.argv) > 1 and sys.argv[1] == '--plot':
        plot_data()
    else:
        parse(LIMIT, grammar)

## logfile.log
Run Microseconds SentenceLength NormalizedMicroseconds
1 1567 5 313.4
2 2507 8 313.375
3 3781 11 343.727272727
4 5553 14 396.642857143
5 8172 17 480.705882353
6 10476 20 523.8
7 15129 23 657.782608696
8 23250 26 894.230769231
9 43597 29 1503.34482759
10 141894 32 4434.1875
11 434989 35 12428.2571429
12 669095 38 17607.7631579
13 428897 41 10460.902439
14 940391 44 21372.5227273
	#!/usr/bin/env python
	import nltk
	import sys
	import datetime
	import matplotlib.pyplot as plt
	import csv

	# Number of timing runs handed to parse() by the script entry point.
	LIMIT=14
	# Toy context-free grammar used for the benchmark sentences
	# ("the man saw the girl" plus repeated "with the telescope").
	# Both VP -> VP PP and N -> N PP accept a trailing PP, so a PP can attach
	# either to the verb phrase or to the noun.
	grammar = nltk.CFG.fromstring("""
	S -> NP VP
	VP -> VP PP
	N -> 'man'
	D -> 'the'
	P -> 'with'
	NP -> D N
	VP -> V NP
	N -> 'girl'
	NP -> 'John'
	V -> 'saw'
	N -> N PP
	PP -> P NP
	N -> 'telescope'
	NP -> 'Mary'
	""")

	def parse_sentence(this_grammar, this_sent):
	    """Parse one tokenised sentence with an Earley chart parser.

	    A new ``nltk.parse.EarleyChartParser`` is built from ``this_grammar``
	    on every call, and whatever its ``parse_one`` yields for ``this_sent``
	    is returned unchanged.
	    """
	    return nltk.parse.EarleyChartParser(this_grammar).parse_one(this_sent)

	def parse(LIMIT, grammar):
	    """Time the parser on progressively longer sentences and log the results.

	    Run ``i`` (counting from 0) parses "the man saw the girl" followed by
	    ``i`` copies of "with the telescope". One line per run is written to
	    ``logfile.log`` in the current directory, below the header
	    ``Run Microseconds SentenceLength NormalizedMicroseconds``.

	    Args:
	        LIMIT: number of runs to perform.
	        grammar: grammar object passed through to ``parse_sentence``.
	    """
	    logfile = ["Run Microseconds SentenceLength NormalizedMicroseconds"]
	    base = "the man saw the girl".split()
	    extension = "with the telescope".split()

	    for i in range(LIMIT):
	        sent = base + extension * i

	        start = datetime.datetime.now()
	        parse_sentence(grammar, sent)
	        end = datetime.datetime.now()
	        dur = end - start

	        # timedelta.microseconds is only the sub-second component
	        # (0..999999); a parse lasting one second or more would be logged
	        # as if it had been faster. Fold days and seconds in so the full
	        # elapsed time is recorded.
	        usec = (dur.days * 86400 + dur.seconds) * 1000000 + dur.microseconds
	        sentlen = len(sent)

	        # Last column: elapsed time normalised by sentence length.
	        logfile.append("{0} {1} {2} {3}".format(
	            i + 1, usec, sentlen, usec / float(sentlen)))

	    with open('logfile.log', 'w') as fh:
	        fh.write('\n'.join(logfile))

	def plot_data():
	    """Plot the normalised timings from ``logfile.log`` and save ``earley.png``.

	    The log is read as a space-delimited table whose first row is a header;
	    the ``NormalizedMicroseconds`` column is converted to floats and plotted
	    in file order.

	    Raises:
	        ValueError: if the header has no ``NormalizedMicroseconds`` column
	            or a value in that column is not a number.
	    """
	    # The Python 2 csv module expects a binary-mode file, while Python 3
	    # needs a text-mode one.
	    mode = 'rb' if sys.version_info[0] < 3 else 'r'
	    with open('logfile.log', mode) as fh:
	        reader = csv.reader(fh, delimiter=' ')
	        # next(reader) works on Python 2.6+ and 3; reader.next() exists
	        # only on Python 2.
	        headers = next(reader)
	        dataIndex = headers.index('NormalizedMicroseconds')
	        # Blank lines come back as empty rows; skip them rather than crash.
	        data = [float(row[dataIndex]) for row in reader if row]

	    ## plot it and save the output
	    plt.plot(data)
	    plt.ylabel('Time in microseconds (normalized by sentence length)')
	    plt.xlabel('Runs')
	    plt.savefig('earley.png')

	if __name__ == '__main__':
	    # Check the argument count first: with no command-line argument,
	    # sys.argv[1] raises IndexError and the default timing run below
	    # could never be reached.
	    if len(sys.argv) > 1 and sys.argv[1] == '--plot':
	        plot_data()
	    else:
	        parse(LIMIT, grammar)
	Run Microseconds SentenceLength NormalizedMicroseconds
	1 1567 5 313.4
	2 2507 8 313.375
	3 3781 11 343.727272727
	4 5553 14 396.642857143
	5 8172 17 480.705882353
	6 10476 20 523.8
	7 15129 23 657.782608696
	8 23250 26 894.230769231
	9 43597 29 1503.34482759
	10 141894 32 4434.1875
	11 434989 35 12428.2571429
	12 669095 38 17607.7631579
	13 428897 41 10460.902439
	14 940391 44 21372.5227273