Created
July 2, 2011 04:07
-
-
Save iki/1059725 to your computer and use it in GitHub Desktop.
Test wordlist parsing speed, see http://news.ycombinator.com/item?id=2716714
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Test wordlist parsing speed.
See http://news.ycombinator.com/item?id=2716714
Use with /usr/share/dict/words,
or e.g. http://www.freebsd.org/cgi/cvsweb.cgi/src/share/dict/web2?rev=1.14;content-type=text/plain
"""
import pickle | |
# Default wordlist location on most Unix-like systems.
WORDFILE = '/usr/share/dict/words'


def words(wordfile=WORDFILE):
    """Load a wordlist by reading the whole file and splitting on newlines.

    Args:
        wordfile: Path of a text file holding one word per line.

    Returns:
        A frozenset with one entry per line of the file.  A blank line
        inside the file contributes an empty-string entry, but the newline
        terminating the last line does not, and an empty file gives an
        empty set.
    """
    with open(wordfile) as wordlist:
        lines = wordlist.read().split('\n')
    # split('\n') always returns at least one element, and that last element
    # is '' when the text ends with a newline or is empty.  It is an artifact
    # of splitting, not a line of the file, so it must not become a "word".
    if lines[-1] == '':
        lines.pop()
    return frozenset(lines)
def words_slower(wordfile=WORDFILE):
    """Load a wordlist by iterating over the file line by line.

    Args:
        wordfile: Path of a text file holding one word per line.

    Returns:
        A frozenset with one entry per line, each with its line terminator
        removed.
    """
    with open(wordfile) as wordlist:
        # rstrip('\n') instead of line[:-1]: the last line of a file need not
        # end with a newline, and slicing would then cut off a real character
        # of the final word.
        return frozenset(line.rstrip('\n') for line in wordlist)
def words_normalized(wordfile=WORDFILE):
    """Load a wordlist, normalizing every entry.

    Each line has its trailing whitespace (including the newline) stripped
    and is converted to lower case before being collected.

    Args:
        wordfile: Path of a text file holding one word per line.

    Returns:
        A frozenset of the normalized lines.
    """
    normalized = set()
    with open(wordfile) as source:
        for raw_line in source:
            normalized.add(raw_line.rstrip().lower())
    return frozenset(normalized)
def words_pickled(pickled):
    """Load a wordlist that was previously saved as a pickle.

    Args:
        pickled: Path of the pickle file to read.

    Returns:
        The object stored in the file.

    Note:
        Unpickling can execute arbitrary code; only use this on pickle files
        you created yourself.
    """
    with open(pickled, 'rb') as stream:
        loaded = pickle.Unpickler(stream).load()
    return loaded
def words_pickle(words, pickled):
    """Save a wordlist to a pickle file.

    Args:
        words: The object (normally a frozenset of words) to serialize.
        pickled: Path of the pickle file to write; an existing file is
            overwritten.
    """
    with open(pickled, 'wb', -1) as sink:
        pickle.Pickler(sink).dump(words)
def msec_per_loop(total_seconds, loops):
    """Convert a timeit total into milliseconds per single loop.

    timeit reports the total number of seconds taken to execute a statement
    ``loops`` times, so the per-loop cost is that total divided by ``loops``.

    Args:
        total_seconds: Total run time in seconds for ``loops`` executions.
        loops: Number of executions the total covers; must be positive.

    Returns:
        The time of one execution in milliseconds, as a float.
    """
    return total_seconds * 1000.0 / loops


if __name__ == '__main__':
    # Command-line driver: load a wordlist from one source (-w, -n or -u) and
    # either pickle it (-p) or print it sorted, or benchmark all loaders (-T).
    import sys
    import optparse

    parser = optparse.OptionParser()
    parser.add_option('-w', '--wordlist', metavar='FILE',
                      help='load normalized wordlist FILE')
    parser.add_option('-n', '--normalize', metavar='FILE',
                      help='load and normalize wordlist FILE')
    parser.add_option('-u', '--unpickle', metavar='PICKLE',
                      help='load wordlist PICKLE')
    parser.add_option('-p', '--pickle', metavar='PICKLE',
                      help='save wordlist to PICKLE')
    parser.add_option('-T', '--timeit', metavar='FILE PICKLE LOOPS REPEAT',
                      nargs=4,
                      help='measure load times of FILE and PICKLE')
    options, args = parser.parse_args()

    if args:
        sys.stderr.write('Excessive arguments: %s\n' % ' '.join(args))
        sys.exit(2)

    if options.timeit:
        import timeit

        # wordfile and pickled have to be module-level names because the
        # timeit setup statement below imports them from __main__.
        wordfile, pickled = options.timeit[:2]
        try:
            loops, repeat = [int(value) for value in options.timeit[2:]]
        except ValueError:
            parser.error('LOOPS and REPEAT must be integers')
        if loops < 1 or repeat < 1:
            parser.error('LOOPS and REPEAT must be positive')

        # Name the benchmarked loaders explicitly instead of scanning
        # locals() for callables, which would pick up any unrelated callable.
        loaders = ('words', 'words_slower', 'words_normalized',
                   'words_pickled')
        setup = ('from __main__ import wordfile, pickled, %s'
                 % ', '.join(loaders))
        statements = (
            'words(wordfile)',
            'words_slower(wordfile)',
            'words_normalized(wordfile)',
            'words_pickled(pickled)',
        )

        sys.stdout.write('%s\n' % sys.version)
        for stmt in statements:
            # Each timing is the total for `loops` runs; report the best
            # (smallest) of the `repeat` timings, scaled to a single run.
            best = min(timeit.repeat(stmt, setup, repeat=repeat,
                                     number=loops))
            sys.stdout.write('%s: %.2f best msec/loop\n'
                             % (stmt, msec_per_loop(best, loops)))
        sys.exit(0)
    elif options.wordlist:
        w = words(options.wordlist)
    elif options.normalize:
        w = words_normalized(options.normalize)
    elif options.unpickle:
        w = words_pickled(options.unpickle)
    else:
        sys.stderr.write('Use either -w, -n, -u, or -T option.\n')
        sys.exit(2)

    if options.pickle:
        words_pickle(w, options.pickle)
    else:
        sys.stdout.write('\n'.join(sorted(w)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment