@zed · Created December 1, 2012
Python: measure time performance of counting words in corpus/gutenberg
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Measure time performance of counting words in nltk's Gutenberg corpus.

Python 2 script. See http://stackoverflow.com/q/13655169
"""
import doctest
import inspect
import os
import sys
import timeit
from collections import Counter, defaultdict
from functools import partial, wraps
from itertools import groupby, imap, product
from string import ascii_letters, ascii_lowercase, maketrans
from timeit import default_timer as timer
import nltk # pip install nltk
WORDS = nltk.corpus.gutenberg.words()
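# WORDS is nltk's lazy corpus view; measure() below defaults to list(WORDS)
# so the corpus is materialized once rather than re-iterated on every call.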
def count_words_Counter(words):
    return sorted(Counter(words).items())
def count_words_groupby(words):
    return [(w, len(list(gr))) for w, gr in groupby(sorted(words))]
def count_words_groupby_sum(words):
    return [(w, sum(1 for _ in gr)) for w, gr in groupby(sorted(words))]
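# Note: groupby() only groups *adjacent* equal items, hence the sorted()
# call in both groupby-based variants above.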
def count_words_defaultdict(words):
    d = defaultdict(int)
    for w in words:
        d[w] += 1
    return sorted(d.items())
def count_words_dict(words):
    d = {}
    for w in words:
        try:
            d[w] += 1
        except KeyError:
            d[w] = 1
    return sorted(d.items())
def _count_words_freqdist(words):
    # note: .items() returns words sorted by word frequency (decreasing order)
    # (same as `Counter.most_common()`)
    # so the code sorts twice (the second time in lexicographical order)
    return sorted(nltk.FreqDist(words).items())
def toascii_letter_lower_genexpr(s, _letter_set=ascii_lowercase):
    """
    >>> toascii_letter_lower_genexpr("ABC,-.!def")
    'abcdef'
    """
    return ''.join(c for c in s.lower() if c in _letter_set)
def toascii_letter_lower_genexpr_set(s, _letter_set=set(ascii_lowercase)):
    return ''.join(c for c in s.lower() if c in _letter_set)
def toascii_letter_lower_translate(
        s,
        table=maketrans(ascii_letters, ascii_lowercase * 2),
        deletechars=''.join(set(maketrans('', '')) - set(ascii_letters))):
    return s.translate(table, deletechars)
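# str.translate() above uses the two-argument Python 2 form: `table`
# lowercases ASCII letters and `deletechars` strips every non-letter byte,
# e.g. toascii_letter_lower_translate("ABC,-.!def") == 'abcdef'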
def toascii_letter_lower_filter(s, _letter_set=set(ascii_letters)):
    return filter(_letter_set.__contains__, s).lower()
def compose(*funcs):
    return lambda arg: reduce(lambda x, g: g(x), reversed(funcs), arg)
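# compose() applies right to left: compose(f, g)(x) == f(g(x)),
# e.g. compose(sorted, set)("baa") == ['a', 'b']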
def get_functions_with_prefix(prefix):
    all_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
    return [f for name, f in all_funcs if name.startswith(prefix)]
def accept_funcs(func):
    """Add function names if necessary.

    Decorator for functions that accept either a sequence of functions
    or (name, function) pairs.
    """
    @wraps(func)
    def wrapper(funcs, *args, **kwargs):
        if hasattr(funcs[0], '__name__'):
            funcs = [(f.__name__, f) for f in funcs]
        return func(funcs, *args, **kwargs)
    return wrapper
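# With @accept_funcs both call styles are equivalent, e.g.:
#   measure([count_words_Counter, count_words_dict])
#   measure([('Counter', count_words_Counter), ('dict', count_words_dict)])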
@accept_funcs
def test_funcs(funcs):
    """Check that all functions produce the same result.

    Each function should accept a readonly sequence of words as an argument.
    """
    words = list(WORDS[:10])
    expected = funcs[0][1](words)
    for name, f in funcs:
        assert f(words) == expected, (name, words, expected)
@accept_funcs
def test_word_funcs(funcs):
    """Check that all functions produce the same result.

    Each function should accept a word as an argument.
    """
    for name, f in funcs[1:]:
        for w in WORDS[:1000]:
            assert f(w) == funcs[0][1](w), (name, w, funcs[0][1](w))
def measure_func(f, args):
    """Measure how long `f(*args)` takes in seconds."""
    n = 1
    while True:
        start = timer()
        r = timeit.repeat(partial(f, *args), number=n, repeat=1)
        if timer() - start > 1:  # at least 1 second per measurement
            break
        n *= 2
    return min(r + timeit.repeat(partial(f, *args), number=n, repeat=2)) / n
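# The loop above doubles `number` until a single timeit batch takes at least
# a second; the result is the best per-call time over three batches
# (the final calibration batch plus two more).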
def human_seconds(seconds, fmt="%.3g %s"):
    """Return human-readable string that represents given seconds."""
    t = 1e6 * seconds  # start with µsec
    for suff in "usec msec".split():
        if t < 1000:
            return fmt % (t, suff)
        t /= 1000
    return fmt % (t, "sec")
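# e.g. human_seconds(0.00001) == '10 usec', human_seconds(2.5) == '2.5 sec'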
@accept_funcs
def measure(funcs, args=[list(WORDS)], comment=''):
    """Report how long `f(*args)` takes for each f in funcs."""
    w = max(len(name) for name, _ in funcs)
    for name, f in funcs:
        print("{:{}s} {:8s} {}".format(
            name, w, human_seconds(measure_func(f, args)), comment))
def combine_counts(items):
    d = defaultdict(int)
    for word, count in items:
        d[word] += count
    return d.iteritems()
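# combine_counts() merges counts for words that become identical after
# normalization, e.g. [('the', 2), ('the', 3)] yields ('the', 5) once
# cleaning has mapped both original keys to 'the'.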
def clean_words_in_items(clean_word, items):
    return ((clean_word(word), count) for word, count in items)
def normalize_count_words(words):
    """Normalize then count words."""
    return count_words_defaultdict(imap(toascii_letter_lower_translate, words))
def count_normalize_words(words):
    """Count then normalize words."""
    freqs = count_words_defaultdict(words)
    freqs = clean_words_in_items(toascii_letter_lower_translate, freqs)
    return sorted(combine_counts(freqs))
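# The two orderings trade work differently: normalize_count_words cleans
# every token in the corpus, while count_normalize_words counts raw tokens
# first and then cleans only the much smaller set of distinct words.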
def main():
    # test
    doctest.testmod()
    count_words_funcs = get_functions_with_prefix('count_words_')
    toascii_funcs = get_functions_with_prefix('toascii_letter_lower_')
    funcs = [count_normalize_words, normalize_count_words]
    before_funcs = [('{} < map {}'.format(f.__name__, g.__name__),
                     compose(f, partial(imap, g)))
                    for f, g in product(count_words_funcs, toascii_funcs)]
    after_funcs = [('sorted < combine_counts < map {} < {}'.format(
                        g.__name__, f.__name__),
                    compose(sorted, combine_counts,
                            partial(clean_words_in_items, g), f))
                   for f, g in product(count_words_funcs, toascii_funcs)]
    test_funcs(count_words_funcs)
    test_word_funcs(toascii_funcs)
    test_funcs(funcs)
    test_funcs(before_funcs + after_funcs)
    # measure performance
    for comment, s in zip(["small", "random 2000"],
                          ["ABC.,!-def", os.urandom(2000)]):
        measure(toascii_funcs, [s], comment)
        print("")
    measure([sorted] + count_words_funcs)
    print("")
    measure(funcs)
    print("")
    print("Convert to ascii-lowercase/remove non-alpha before counting words")
    measure(before_funcs)
    print("")
    print("Convert to ascii-lowercase/remove non-alpha after counting words")
    measure(after_funcs)
if __name__ == "__main__":
    main()