Skip to content

Instantly share code, notes, and snippets.

@zed
Created December 1, 2012 03:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zed/4180407 to your computer and use it in GitHub Desktop.
Save zed/4180407 to your computer and use it in GitHub Desktop.
Python: measure time performance of counting words in corpus/gutenberg
#!/usr/bin/env python
"""
http://stackoverflow.com/q/13655169
"""
import inspect
import sys
import timeit
from collections import Counter, defaultdict
from functools import partial
from itertools import groupby
import nltk # pip install nltk
# Word sequence from NLTK's Gutenberg corpus. The self-test uses its first
# 1000 items; the benchmark materializes all of it into a list.
WORDS = nltk.corpus.gutenberg.words()
def count_words_Counter(words):
    """Count words with ``collections.Counter``.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    tally = Counter(words)
    pairs = list(tally.items())
    pairs.sort()
    return pairs
def count_words_groupby(words):
    """Count words by sorting them and measuring each run of equal words.

    Each run produced by ``itertools.groupby`` is materialized into a list
    and its length is taken as the count.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    ordered = sorted(words)
    pairs = []
    for word, run in groupby(ordered):
        pairs.append((word, len(list(run))))
    return pairs
def count_words_groupby_sum(words):
    """Count words by sorting them and summing over each run of equal words.

    Unlike ``count_words_groupby``-style list materialization, each run is
    consumed lazily by adding 1 per item.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    pairs = []
    for word, run in groupby(sorted(words)):
        occurrences = sum(1 for _ in run)
        pairs.append((word, occurrences))
    return pairs
def count_words_defaultdict(words):
    """Count words with a ``defaultdict(int)`` accumulator.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    tally = defaultdict(int)
    for word in words:
        # Missing keys start at int() == 0, so the increment always works.
        tally[word] += 1
    pairs = list(tally.items())
    pairs.sort()
    return pairs
def count_words_dict(words):
    """Count words with a plain ``dict``, handling first sightings via ``KeyError``.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    tally = {}
    for word in words:
        try:
            tally[word] += 1
        except KeyError:
            # First occurrence of this word: start its count at one.
            tally[word] = 1
    pairs = list(tally.items())
    pairs.sort()
    return pairs
def count_words_freqdist(words):
    """Count words with ``nltk.FreqDist``.

    Returns a list of ``(word, count)`` pairs sorted by word.
    """
    # Original author's note: .items() returns words sorted by word frequency
    # (decreasing order), same as `Counter.most_common()`, so the words end up
    # sorted twice (the second time, here, in alphabetical order).
    # NOTE(review): that ordering may depend on the installed NLTK version --
    # confirm before relying on it.
    distribution = nltk.FreqDist(words)
    return sorted(distribution.items())
def get_count_words_functions():
    """Collect every function in this module whose name starts with ``count_words_``.

    The functions are returned in the order ``inspect.getmembers`` yields
    them.
    """
    this_module = sys.modules[__name__]
    selected = []
    for name, func in inspect.getmembers(this_module, inspect.isfunction):
        if name.startswith('count_words_'):
            selected.append(func)
    return selected
def test_func(f, words, expected):
assert f(words) == expected
def test():
    """Check that all ``count_words_*`` functions agree on a small sample.

    The first discovered function's result on the first 1000 corpus words
    serves as the reference that every function (including itself) is
    compared against.
    """
    candidates = get_count_words_functions()
    sample = list(WORDS[:1000])
    reference = candidates[0](sample)
    for candidate in candidates:
        test_func(candidate, sample, reference)
def measure_func(f, words):
    """Return the best (smallest) time of running ``f(words)`` once.

    ``timeit.repeat`` performs several repetitions, each timing a single
    call (``number=1``); the minimum of those timings is returned.
    """
    call = partial(f, words)
    timings = timeit.repeat(call, number=1)
    return min(timings)
def measure():
    """Print a timing for ``sorted`` and for each ``count_words_*`` function.

    Each function is timed on the whole corpus, materialized once as a list.
    Names are padded to the width of the longest ``count_words_*`` name.
    """
    funcs = get_count_words_functions()
    corpus = list(WORDS)
    name_width = max(len(func.__name__) for func in funcs)
    row = "{:{}s} {:.2f}"
    # Plain ``sorted`` is timed first, ahead of the counting functions.
    for func in [sorted] + funcs:
        print(row.format(func.__name__, name_width, measure_func(func, corpus)))
# Run the consistency check first, then the benchmark. Both calls sit at
# module top level, so they execute whenever this file is run or imported.
test()
measure()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment