zed/count-words-performance.py

## count-words-performance.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
http://stackoverflow.com/q/13655169
"""
import doctest
import inspect
import os
import sys
import timeit
from collections import Counter, defaultdict
from functools import partial, wraps
from itertools import groupby, imap, product
from string import ascii_letters, ascii_lowercase, maketrans
from timeit import default_timer as timer

import nltk  # pip install nltk

WORDS = nltk.corpus.gutenberg.words()

def count_words_Counter(words):
    return sorted(Counter(words).items())

def count_words_groupby(words):
    return [(w, len(list(gr))) for w, gr in groupby(sorted(words))]

def count_words_groupby_sum(words):
    return [(w, sum(1 for _ in gr)) for w, gr in groupby(sorted(words))]

def count_words_defaultdict(words):
    d = defaultdict(int)
    for w in words:
        d[w] += 1
    return sorted(d.items())

def count_words_dict(words):
    d = {}
    for w in words:
        try:
            d[w] += 1
        except KeyError:
            d[w] = 1
    return sorted(d.items())

def _count_words_freqdist(words):
    # note: .items() returns words sorted by word frequency (descreasing order)
    #       (same as `Counter.most_common()`)
    #       so the code sorts twice (the second time in lexicographical order)
    return sorted(nltk.FreqDist(words).items())

def toascii_letter_lower_genexpr(s, _letter_set=ascii_lowercase):
    """
    >>> toascii_letter_lower_genexpr("ABC,-.!def")
    'abcdef'
    """
    return ''.join(c for c in s.lower() if c in _letter_set)

def toascii_letter_lower_genexpr_set(s, _letter_set=set(ascii_lowercase)):
    return ''.join(c for c in s.lower() if c in _letter_set)

def toascii_letter_lower_translate(s,
    table=maketrans(ascii_letters, ascii_lowercase * 2),
    deletechars=''.join(set(maketrans('', '')) - set(ascii_letters))):
    return s.translate(table, deletechars)

def toascii_letter_lower_filter(s, _letter_set=set(ascii_letters)):
    return filter(_letter_set.__contains__, s).lower()

def compose(*funcs):
    return lambda arg: reduce(lambda x, g: g(x), reversed(funcs), arg)

def get_functions_with_prefix(prefix):
    all_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
    return [f for name, f in all_funcs if name.startswith(prefix)]

def accept_funcs(func):
    """Add function names if necessary.

    Decorator for functions that accept either a sequence of functions
    or (name, function) pairs.
    """
    @wraps(func)
    def wrapper(funcs, *args, **kwargs):
        if hasattr(funcs[0], '__name__'):
            funcs = [(f.__name__, f) for f in funcs]
        return func(funcs, *args, **kwargs)
    return wrapper

@accept_funcs
def test_funcs(funcs):
    """Check that all functions produce the same result.

    Each function should accept a readonly sequence of words as an argument
    """
    words = list(WORDS[:10])
    expected = funcs[0][1](words)

    for name, f in funcs:
        assert f(words) == expected, (name, words, expected)

@accept_funcs
def test_word_funcs(funcs):
    """Check that all functions produce the same result.

    Each function should accept a word as an argument
    """
    for name, f in funcs[1:]:
        for w in WORDS[:1000]:
            assert f(w) == funcs[0][1](w), (name, w, funcs[0][1](w))

def measure_func(f, args):
    """Measure how long `f(*args)` takes in seconds."""
    n = 1
    while True:
        start = timer()
        r = timeit.repeat(partial(f, *args), number=n, repeat=1)
        if timer() - start > 1: # at least 1 second per measurement
            break
        n *= 2
    return min(r + timeit.repeat(partial(f, *args), number=n, repeat=2)) / n

def human_seconds(seconds, fmt="%.3g %s"):
    """Return human-readable string that represents given seconds."""
    t = 1e6 * seconds # start with µsec
    for suff in "usec msec".split():
        if t < 1000:
            return fmt % (t, suff)
        t /= 1000
    return fmt % (t, "sec")

@accept_funcs
def measure(funcs, args=[list(WORDS)], comment=''):
    """Report how long `f(*args)` takes for each f in funcs."""
    w = max(len(name) for name, _ in funcs)
    for name, f in funcs:
        print("{:{}s} {:8s} {}".format(
                name, w, human_seconds(measure_func(f, args)), comment))

def combine_counts(items):
    d = defaultdict(int)
    for word, count in items:
        d[word] += count
    return d.iteritems()

def clean_words_in_items(clean_word, items):
    return ((clean_word(word), count) for word, count in items)

def normalize_count_words(words):
    """Normalize then count words."""
    return count_words_defaultdict(imap(toascii_letter_lower_translate, words))

def count_normalize_words(words):
    """Count then normalize words."""
    freqs = count_words_defaultdict(words)
    freqs = clean_words_in_items(toascii_letter_lower_translate, freqs)
    return sorted(combine_counts(freqs))

def main():
    # test
    doctest.testmod()
    count_words_funcs = get_functions_with_prefix('count_words_')
    toascii_funcs = get_functions_with_prefix('toascii_letter_lower_')
    funcs = [count_normalize_words, normalize_count_words]
    before_funcs = [('{} < map {}'.format(f.__name__, g.__name__),
                     compose(f, partial(imap, g)))
                    for f, g in product(count_words_funcs, toascii_funcs)]
    after_funcs = [('sorted < combine_counts < map {} < {}'.format(
                g.__name__, f.__name__),
                    compose(sorted, combine_counts,
                            partial(clean_words_in_items, g), f))
                   for f, g in product(count_words_funcs, toascii_funcs)]
    test_funcs(count_words_funcs)
    test_word_funcs(toascii_funcs)
    test_funcs(funcs)
    test_funcs(before_funcs + after_funcs)

    # measure performance
    for comment, s in zip(["small", "random 2000"],
                          ["ABC.,!-def", os.urandom(2000)]):
        measure(toascii_funcs, [s], comment)
        print("")
    measure([sorted] + count_words_funcs)
    print("")
    measure(funcs)
    print("")
    print("Convert to ascii-lowercase/remove non-alpha before counting words")
    measure(before_funcs)
    print("")
    print("Convert to ascii-lowercase/remove non-alpha after counting words")
    measure(after_funcs)

main()
	#!/usr/bin/env python
	# -- coding: utf-8 --
	"""
	http://stackoverflow.com/q/13655169
	"""
	import doctest
	import inspect
	import os
	import sys
	import timeit
	from collections import Counter, defaultdict
	from functools import partial, wraps
	from itertools import groupby, imap, product
	from string import ascii_letters, ascii_lowercase, maketrans
	from timeit import default_timer as timer

	import nltk # pip install nltk

	WORDS = nltk.corpus.gutenberg.words()

	def count_words_Counter(words):
	return sorted(Counter(words).items())

	def count_words_groupby(words):
	return [(w, len(list(gr))) for w, gr in groupby(sorted(words))]

	def count_words_groupby_sum(words):
	return [(w, sum(1 for _ in gr)) for w, gr in groupby(sorted(words))]

	def count_words_defaultdict(words):
	d = defaultdict(int)
	for w in words:
	d[w] += 1
	return sorted(d.items())

	def count_words_dict(words):
	d = {}
	for w in words:
	try:
	d[w] += 1
	except KeyError:
	d[w] = 1
	return sorted(d.items())

	def _count_words_freqdist(words):
	# note: .items() returns words sorted by word frequency (descreasing order)
	# (same as `Counter.most_common()`)
	# so the code sorts twice (the second time in lexicographical order)
	return sorted(nltk.FreqDist(words).items())

	def toascii_letter_lower_genexpr(s, _letter_set=ascii_lowercase):
	"""
	>>> toascii_letter_lower_genexpr("ABC,-.!def")
	'abcdef'
	"""
	return ''.join(c for c in s.lower() if c in _letter_set)

	def toascii_letter_lower_genexpr_set(s, _letter_set=set(ascii_lowercase)):
	return ''.join(c for c in s.lower() if c in _letter_set)

	def toascii_letter_lower_translate(s,
	table=maketrans(ascii_letters, ascii_lowercase * 2),
	deletechars=''.join(set(maketrans('', '')) - set(ascii_letters))):
	return s.translate(table, deletechars)

	def toascii_letter_lower_filter(s, _letter_set=set(ascii_letters)):
	return filter(_letter_set.__contains__, s).lower()

	def compose(*funcs):
	return lambda arg: reduce(lambda x, g: g(x), reversed(funcs), arg)

	def get_functions_with_prefix(prefix):
	all_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
	return [f for name, f in all_funcs if name.startswith(prefix)]

	def accept_funcs(func):
	"""Add function names if necessary.

	Decorator for functions that accept either a sequence of functions
	or (name, function) pairs.
	"""
	@wraps(func)
	def wrapper(funcs, args, *kwargs):
	if hasattr(funcs[0], '__name__'):
	funcs = [(f.__name__, f) for f in funcs]
	return func(funcs, args, *kwargs)
	return wrapper

	@accept_funcs
	def test_funcs(funcs):
	"""Check that all functions produce the same result.

	Each function should accept a readonly sequence of words as an argument
	"""
	words = list(WORDS[:10])
	expected = funcs[0][1](words)

	for name, f in funcs:
	assert f(words) == expected, (name, words, expected)

	@accept_funcs
	def test_word_funcs(funcs):
	"""Check that all functions produce the same result.

	Each function should accept a word as an argument
	"""
	for name, f in funcs[1:]:
	for w in WORDS[:1000]:
	assert f(w) == funcs[0][1](w), (name, w, funcs[0][1](w))

	def measure_func(f, args):
	"""Measure how long `f(*args)` takes in seconds."""
	n = 1
	while True:
	start = timer()
	r = timeit.repeat(partial(f, *args), number=n, repeat=1)
	if timer() - start > 1: # at least 1 second per measurement
	break
	n *= 2
	return min(r + timeit.repeat(partial(f, *args), number=n, repeat=2)) / n

	def human_seconds(seconds, fmt="%.3g %s"):
	"""Return human-readable string that represents given seconds."""
	t = 1e6 * seconds # start with µsec
	for suff in "usec msec".split():
	if t < 1000:
	return fmt % (t, suff)
	t /= 1000
	return fmt % (t, "sec")

	@accept_funcs
	def measure(funcs, args=[list(WORDS)], comment=''):
	"""Report how long `f(*args)` takes for each f in funcs."""
	w = max(len(name) for name, _ in funcs)
	for name, f in funcs:
	print("{:{}s} {:8s} {}".format(
	name, w, human_seconds(measure_func(f, args)), comment))

	def combine_counts(items):
	d = defaultdict(int)
	for word, count in items:
	d[word] += count
	return d.iteritems()

	def clean_words_in_items(clean_word, items):
	return ((clean_word(word), count) for word, count in items)

	def normalize_count_words(words):
	"""Normalize then count words."""
	return count_words_defaultdict(imap(toascii_letter_lower_translate, words))

	def count_normalize_words(words):
	"""Count then normalize words."""
	freqs = count_words_defaultdict(words)
	freqs = clean_words_in_items(toascii_letter_lower_translate, freqs)
	return sorted(combine_counts(freqs))

	def main():
	# test
	doctest.testmod()
	count_words_funcs = get_functions_with_prefix('count_words_')
	toascii_funcs = get_functions_with_prefix('toascii_letter_lower_')
	funcs = [count_normalize_words, normalize_count_words]
	before_funcs = [('{} < map {}'.format(f.__name__, g.__name__),
	compose(f, partial(imap, g)))
	for f, g in product(count_words_funcs, toascii_funcs)]
	after_funcs = [('sorted < combine_counts < map {} < {}'.format(
	g.__name__, f.__name__),
	compose(sorted, combine_counts,
	partial(clean_words_in_items, g), f))
	for f, g in product(count_words_funcs, toascii_funcs)]
	test_funcs(count_words_funcs)
	test_word_funcs(toascii_funcs)
	test_funcs(funcs)
	test_funcs(before_funcs + after_funcs)

	# measure performance
	for comment, s in zip(["small", "random 2000"],
	["ABC.,!-def", os.urandom(2000)]):
	measure(toascii_funcs, [s], comment)
	print("")
	measure([sorted] + count_words_funcs)
	print("")
	measure(funcs)
	print("")
	print("Convert to ascii-lowercase/remove non-alpha before counting words")
	measure(before_funcs)
	print("")
	print("Convert to ascii-lowercase/remove non-alpha after counting words")
	measure(after_funcs)

	main()