Created
December 1, 2012 03:21
-
-
Save zed/4180407 to your computer and use it in GitHub Desktop.
Python: measure time performance of counting words in corpus/gutenberg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
http://stackoverflow.com/q/13655169 | |
""" | |
import doctest | |
import inspect | |
import os | |
import sys | |
import timeit | |
from collections import Counter, defaultdict | |
from functools import partial, wraps | |
from itertools import groupby, imap, product | |
from string import ascii_letters, ascii_lowercase, maketrans | |
from timeit import default_timer as timer | |
import nltk # pip install nltk | |
WORDS = nltk.corpus.gutenberg.words() | |
def count_words_Counter(words): | |
return sorted(Counter(words).items()) | |
def count_words_groupby(words): | |
return [(w, len(list(gr))) for w, gr in groupby(sorted(words))] | |
def count_words_groupby_sum(words): | |
return [(w, sum(1 for _ in gr)) for w, gr in groupby(sorted(words))] | |
def count_words_defaultdict(words): | |
d = defaultdict(int) | |
for w in words: | |
d[w] += 1 | |
return sorted(d.items()) | |
def count_words_dict(words): | |
d = {} | |
for w in words: | |
try: | |
d[w] += 1 | |
except KeyError: | |
d[w] = 1 | |
return sorted(d.items()) | |
def _count_words_freqdist(words): | |
# note: .items() returns words sorted by word frequency (descreasing order) | |
# (same as `Counter.most_common()`) | |
# so the code sorts twice (the second time in lexicographical order) | |
return sorted(nltk.FreqDist(words).items()) | |
def toascii_letter_lower_genexpr(s, _letter_set=ascii_lowercase): | |
""" | |
>>> toascii_letter_lower_genexpr("ABC,-.!def") | |
'abcdef' | |
""" | |
return ''.join(c for c in s.lower() if c in _letter_set) | |
def toascii_letter_lower_genexpr_set(s, _letter_set=set(ascii_lowercase)): | |
return ''.join(c for c in s.lower() if c in _letter_set) | |
def toascii_letter_lower_translate(s, | |
table=maketrans(ascii_letters, ascii_lowercase * 2), | |
deletechars=''.join(set(maketrans('', '')) - set(ascii_letters))): | |
return s.translate(table, deletechars) | |
def toascii_letter_lower_filter(s, _letter_set=set(ascii_letters)): | |
return filter(_letter_set.__contains__, s).lower() | |
def compose(*funcs): | |
return lambda arg: reduce(lambda x, g: g(x), reversed(funcs), arg) | |
def get_functions_with_prefix(prefix): | |
all_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction) | |
return [f for name, f in all_funcs if name.startswith(prefix)] | |
def accept_funcs(func): | |
"""Add function names if necessary. | |
Decorator for functions that accept either a sequence of functions | |
or (name, function) pairs. | |
""" | |
@wraps(func) | |
def wrapper(funcs, *args, **kwargs): | |
if hasattr(funcs[0], '__name__'): | |
funcs = [(f.__name__, f) for f in funcs] | |
return func(funcs, *args, **kwargs) | |
return wrapper | |
@accept_funcs | |
def test_funcs(funcs): | |
"""Check that all functions produce the same result. | |
Each function should accept a readonly sequence of words as an argument | |
""" | |
words = list(WORDS[:10]) | |
expected = funcs[0][1](words) | |
for name, f in funcs: | |
assert f(words) == expected, (name, words, expected) | |
@accept_funcs | |
def test_word_funcs(funcs): | |
"""Check that all functions produce the same result. | |
Each function should accept a word as an argument | |
""" | |
for name, f in funcs[1:]: | |
for w in WORDS[:1000]: | |
assert f(w) == funcs[0][1](w), (name, w, funcs[0][1](w)) | |
def measure_func(f, args): | |
"""Measure how long `f(*args)` takes in seconds.""" | |
n = 1 | |
while True: | |
start = timer() | |
r = timeit.repeat(partial(f, *args), number=n, repeat=1) | |
if timer() - start > 1: # at least 1 second per measurement | |
break | |
n *= 2 | |
return min(r + timeit.repeat(partial(f, *args), number=n, repeat=2)) / n | |
def human_seconds(seconds, fmt="%.3g %s"): | |
"""Return human-readable string that represents given seconds.""" | |
t = 1e6 * seconds # start with µsec | |
for suff in "usec msec".split(): | |
if t < 1000: | |
return fmt % (t, suff) | |
t /= 1000 | |
return fmt % (t, "sec") | |
@accept_funcs | |
def measure(funcs, args=[list(WORDS)], comment=''): | |
"""Report how long `f(*args)` takes for each f in funcs.""" | |
w = max(len(name) for name, _ in funcs) | |
for name, f in funcs: | |
print("{:{}s} {:8s} {}".format( | |
name, w, human_seconds(measure_func(f, args)), comment)) | |
def combine_counts(items): | |
d = defaultdict(int) | |
for word, count in items: | |
d[word] += count | |
return d.iteritems() | |
def clean_words_in_items(clean_word, items): | |
return ((clean_word(word), count) for word, count in items) | |
def normalize_count_words(words): | |
"""Normalize then count words.""" | |
return count_words_defaultdict(imap(toascii_letter_lower_translate, words)) | |
def count_normalize_words(words): | |
"""Count then normalize words.""" | |
freqs = count_words_defaultdict(words) | |
freqs = clean_words_in_items(toascii_letter_lower_translate, freqs) | |
return sorted(combine_counts(freqs)) | |
def main(): | |
# test | |
doctest.testmod() | |
count_words_funcs = get_functions_with_prefix('count_words_') | |
toascii_funcs = get_functions_with_prefix('toascii_letter_lower_') | |
funcs = [count_normalize_words, normalize_count_words] | |
before_funcs = [('{} < map {}'.format(f.__name__, g.__name__), | |
compose(f, partial(imap, g))) | |
for f, g in product(count_words_funcs, toascii_funcs)] | |
after_funcs = [('sorted < combine_counts < map {} < {}'.format( | |
g.__name__, f.__name__), | |
compose(sorted, combine_counts, | |
partial(clean_words_in_items, g), f)) | |
for f, g in product(count_words_funcs, toascii_funcs)] | |
test_funcs(count_words_funcs) | |
test_word_funcs(toascii_funcs) | |
test_funcs(funcs) | |
test_funcs(before_funcs + after_funcs) | |
# measure performance | |
for comment, s in zip(["small", "random 2000"], | |
["ABC.,!-def", os.urandom(2000)]): | |
measure(toascii_funcs, [s], comment) | |
print("") | |
measure([sorted] + count_words_funcs) | |
print("") | |
measure(funcs) | |
print("") | |
print("Convert to ascii-lowercase/remove non-alpha before counting words") | |
measure(before_funcs) | |
print("") | |
print("Convert to ascii-lowercase/remove non-alpha after counting words") | |
measure(after_funcs) | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment