Skip to content

Instantly share code, notes, and snippets.

@iki
Created July 2, 2011 04:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iki/1059725 to your computer and use it in GitHub Desktop.
Save iki/1059725 to your computer and use it in GitHub Desktop.
Test wordlist parsing speed, see http://news.ycombinator.com/item?id=2716714
#!/usr/bin/env python
"""Test wordlist parsing speed.
See http://news.ycombinator.com/item?id=2716714
Use with /usr/share/dict/words,
or e.g. http://www.freebsd.org/cgi/cvsweb.cgi/src/share/dict/web2?rev=1.14;content-type=text/plain
"""
import pickle
WORDFILE = '/usr/share/dict/words'
def words(wordfile=WORDFILE):
    """Return the lines of wordfile as a frozenset, read in one gulp.

    The whole file is read with a single call and split on the newline
    character.  Entries are kept exactly as written (no stripping, no case
    folding).  Because of the plain split, a file whose content ends with a
    newline also contributes the empty string to the result.
    """
    with open(wordfile) as source:
        content = source.read()
    return frozenset(content.split('\n'))
def words_slower(wordfile=WORDFILE):
    """Return the lines of wordfile as a frozenset, iterating line by line.

    Only the trailing newline is removed from each line; the rest of the
    line (case, other whitespace) is kept as written.

    The newline is stripped explicitly rather than by slicing off the last
    character, so the final word stays intact when the file does not end
    with a newline.
    """
    with open(wordfile) as wordlist:
        return frozenset(line.rstrip('\n') for line in wordlist)
def words_normalized(wordfile=WORDFILE):
    """Return the normalized lines of wordfile as a frozenset.

    Each line has its trailing whitespace (including the newline) removed
    and is then converted to lower case.
    """
    normalized = set()
    with open(wordfile) as source:
        for raw_line in source:
            stripped = raw_line.rstrip()
            normalized.add(stripped.lower())
    return frozenset(normalized)
def words_pickled(pickled):
    """Return the object stored in the pickle file at path pickled.

    The file is opened in binary mode and handed to pickle.load.

    NOTE: unpickling can execute arbitrary code, so only load pickle files
    that come from a trusted source.
    """
    with open(pickled, 'rb') as source:
        loaded = pickle.load(source)
    return loaded
def words_pickle(words, pickled):
    """Pickle words into the file at path pickled.

    The target is opened for binary writing (replacing any existing
    content) and words is serialized with pickle's default protocol.
    Nothing is returned.
    """
    with open(pickled, 'wb', -1) as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(words)
if __name__ == '__main__':
    # Command-line driver: either benchmark the loaders (-T), or load a
    # wordlist with one of -w / -n / -u and then pickle it (-p) or print it
    # sorted to stdout.
    import sys
    import optparse

    parser = optparse.OptionParser()
    parser.add_option('-w', '--wordlist', metavar='FILE',
                      help='load normalized wordlist FILE')
    parser.add_option('-n', '--normalize', metavar='FILE',
                      help='load and normalize wordlist FILE')
    parser.add_option('-u', '--unpickle', metavar='PICKLE',
                      help='load wordlist PICKLE')
    parser.add_option('-p', '--pickle', metavar='PICKLE',
                      help='save wordlist to PICKLE')
    parser.add_option('-T', '--timeit', metavar='FILE PICKLE LOOPS REPEAT',
                      nargs=4,
                      help='measure load times of FILE and PICKLE')
    options, args = parser.parse_args()

    if args:
        sys.stderr.write('Excessive arguments: %s\n' % ' '.join(args))
        sys.exit(2)

    if options.timeit:
        import timeit

        # wordfile and pickled must be module-level names: the timeit setup
        # statement below imports them from __main__.
        wordfile, pickled = options.timeit[:2]
        try:
            loops, repeat = [int(value) for value in options.timeit[2:]]
        except ValueError:
            sys.stderr.write('LOOPS and REPEAT must be integers.\n')
            sys.exit(2)
        if loops < 1 or repeat < 1:
            # min() of an empty result list and a per-loop division by zero
            # would otherwise fail with a traceback.
            sys.stderr.write('LOOPS and REPEAT must be positive.\n')
            sys.exit(2)

        statements = (
            'words(wordfile)',
            'words_slower(wordfile)',
            'words_normalized(wordfile)',
            'words_pickled(pickled)',
        )
        # Import exactly the functions being timed instead of scanning the
        # namespace for anything that happens to be callable.
        loaders = [statement.split('(')[0] for statement in statements]
        setup = ('from __main__ import wordfile, pickled, %s'
                 % ', '.join(loaders))

        sys.stdout.write('%s\n' % sys.version)
        for stmt in statements:
            # timeit.repeat returns the total seconds taken by `loops`
            # executions per repetition; convert the best total to
            # milliseconds per single loop.
            best = min(timeit.repeat(stmt, setup, repeat=repeat,
                                     number=loops))
            sys.stdout.write('%s: %.2f best msec/loop\n'
                             % (stmt, best * 1000.0 / loops))
        sys.exit(0)
    elif options.wordlist:
        w = words(options.wordlist)
    elif options.normalize:
        w = words_normalized(options.normalize)
    elif options.unpickle:
        w = words_pickled(options.unpickle)
    else:
        sys.stderr.write('Use either -w, -n, -u, or -T option.\n')
        sys.exit(2)

    if options.pickle:
        words_pickle(w, options.pickle)
    else:
        sys.stdout.write('\n'.join(sorted(w)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment