zed/count-words-performance.py

## count-words-performance.py
#!/usr/bin/env python
"""
http://stackoverflow.com/q/13655169
"""
import inspect
import sys
import timeit
from collections import Counter, defaultdict
from functools import partial
from itertools import groupby

import nltk  # pip install nltk

# Word sequence shared by test() and measure(): whatever NLTK's `gutenberg`
# corpus reader returns from `.words()`.
# NOTE(review): presumably requires the corpus data to be installed first
# (e.g. `nltk.download('gutenberg')`) -- confirm.
WORDS = nltk.corpus.gutenberg.words()

def count_words_Counter(words):
    """Count words with `collections.Counter`.

    Returns a list of `(word, count)` pairs sorted by word.
    """
    tally = Counter(words)
    return sorted(tally.items())

def count_words_groupby(words):
    """Count words by sorting them and measuring each run of equal words.

    Every run yielded by `itertools.groupby` is materialized as a list so
    its length can be taken. Returns `(word, count)` pairs sorted by word.
    """
    ordered = sorted(words)
    return [(word, len(list(run))) for word, run in groupby(ordered)]

def count_words_groupby_sum(words):
    """Count words by sorting them and tallying each run of equal words.

    Unlike `count_words_groupby`, a run is counted by summing 1 per element
    instead of building an intermediate list. Returns `(word, count)` pairs
    sorted by word.
    """
    ordered = sorted(words)
    return [(word, sum(1 for _ in run)) for word, run in groupby(ordered)]

def count_words_defaultdict(words):
    """Count words with a `defaultdict(int)` and an explicit loop.

    Returns a list of `(word, count)` pairs sorted by word.
    """
    tally = defaultdict(int)  # unseen words start from int() == 0
    for word in words:
        tally[word] += 1
    return sorted(tally.items())

def count_words_dict(words):
    """Count words with a plain dict, using try/except for new words.

    Returns a list of `(word, count)` pairs sorted by word.
    """
    tally = {}
    for word in words:
        try:
            tally[word] += 1
        except KeyError:
            # first occurrence of this word
            tally[word] = 1
    return sorted(tally.items())

def count_words_freqdist(words):
    """Count words with `nltk.FreqDist`.

    Returns a list of `(word, count)` pairs sorted by word.
    """
    # note: .items() returns words sorted by word frequency (decreasing order)
    #       (same as `Counter.most_common()`), so the code sorts twice
    #       (the second time in alphabetical order).
    # NOTE(review): the ordering of .items() may depend on the installed
    #       NLTK version -- confirm.
    freq = nltk.FreqDist(words)
    return sorted(freq.items())

def get_count_words_functions():
    """Return every function in this module named ``count_words_*``.

    The module object is looked up through `sys.modules`, and the functions
    are returned in the order `inspect.getmembers` yields them.
    """
    this_module = sys.modules[__name__]
    found = []
    for name, func in inspect.getmembers(this_module, inspect.isfunction):
        if name.startswith('count_words_'):
            found.append(func)
    return found

def test_func(f, words, expected):
    assert f(words) == expected

def test():
    """Check that all `count_words_*` functions agree with each other.

    The first 1000 items of `WORDS` are used as input, and the output of the
    first discovered function serves as the reference for all of them.
    """
    candidates = get_count_words_functions()
    sample = list(WORDS[:1000])
    reference = candidates[0](sample)

    for candidate in candidates:
        test_func(candidate, sample, reference)

def measure_func(f, words):
    """Return the smallest time `timeit` measures for one `f(words)` call.

    Each timing executes the call once (`number=1`); the number of timings
    is `timeit`'s default repeat count.
    """
    timer = timeit.Timer(partial(f, words))
    return min(timer.repeat(number=1))

def measure():
    """Time `sorted` and every `count_words_*` function on all of `WORDS`.

    Prints one line per function: its name, padded to the length of the
    longest `count_words_*` name, followed by the best time with two
    decimals.
    """
    funcs = get_count_words_functions()
    corpus = list(WORDS)
    width = max(len(func.__name__) for func in funcs)
    # the builtin `sorted` is timed first, presumably as a baseline since
    # every counter ends by sorting its result
    for func in [sorted] + funcs:
        elapsed = measure_func(func, corpus)
        print("{:{}s} {:.2f}".format(func.__name__, width, elapsed))

# Executed whenever the module is run or imported: first check that all
# implementations return the same result, then print the timings.
test()
measure()
	#!/usr/bin/env python
	"""
	http://stackoverflow.com/q/13655169
	"""
	import inspect
	import sys
	import timeit
	from collections import Counter, defaultdict
	from functools import partial
	from itertools import groupby

	import nltk # pip install nltk

	# Word sequence shared by test() and measure(): whatever NLTK's
	# `gutenberg` corpus reader returns from `.words()`.
	WORDS = nltk.corpus.gutenberg.words()

	def count_words_Counter(words):
	    """Count words with `collections.Counter`.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    return sorted(Counter(words).items())

	def count_words_groupby(words):
	    """Count words by sorting them and taking the length of each run.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    return [(w, len(list(gr))) for w, gr in groupby(sorted(words))]

	def count_words_groupby_sum(words):
	    """Count words by sorting them and summing 1 per element of each run.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    return [(w, sum(1 for _ in gr)) for w, gr in groupby(sorted(words))]

	def count_words_defaultdict(words):
	    """Count words with a `defaultdict(int)` and an explicit loop.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    d = defaultdict(int)
	    for w in words:
	        d[w] += 1
	    return sorted(d.items())

	def count_words_dict(words):
	    """Count words with a plain dict, using try/except for new words.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    d = {}
	    for w in words:
	        try:
	            d[w] += 1
	        except KeyError:
	            # first occurrence of this word
	            d[w] = 1
	    return sorted(d.items())

	def count_words_freqdist(words):
	    """Count words with `nltk.FreqDist`.

	    Returns a list of `(word, count)` pairs sorted by word.
	    """
	    # note: .items() returns words sorted by word frequency (decreasing order)
	    #       (same as `Counter.most_common()`)
	    #       so the code sorts twice (the second time in alphabetical order)
	    # NOTE(review): the ordering of .items() may depend on the installed
	    #       NLTK version -- confirm.
	    return sorted(nltk.FreqDist(words).items())

	def get_count_words_functions():
	    """Return every function in this module named ``count_words_*``.

	    The module object is looked up through `sys.modules`, and the
	    functions are returned in the order `inspect.getmembers` yields them.
	    """
	    all_funcs = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
	    return [f for name, f in all_funcs if name.startswith('count_words_')]

	def test_func(f, words, expected):
	assert f(words) == expected

	def test():
	    """Check that all `count_words_*` functions agree with each other.

	    The first 1000 items of `WORDS` are used as input, and the output of
	    the first discovered function serves as the reference.
	    """
	    funcs = get_count_words_functions()
	    words = list(WORDS[:1000])
	    expected = funcs[0](words)

	    for f in funcs:
	        test_func(f, words, expected)

	def measure_func(f, words):
	    """Return the smallest time `timeit` measures for one `f(words)` call.

	    Each timing executes the call once (`number=1`); the number of
	    timings is `timeit`'s default repeat count.
	    """
	    return min(timeit.repeat(partial(f, words), number=1))

	def measure():
	    """Time `sorted` and every `count_words_*` function on all of `WORDS`.

	    Prints one line per function: its name, padded to the length of the
	    longest `count_words_*` name, followed by the best time with two
	    decimals.
	    """
	    funcs = get_count_words_functions()
	    words = list(WORDS)
	    w = max(len(f.__name__) for f in funcs)
	    # the builtin `sorted` is timed first, ahead of the counters
	    for f in [sorted] + funcs:
	        print("{:{}s} {:.2f}".format(f.__name__, w, measure_func(f, words)))

	# First check that all implementations return the same result,
	# then print the timings.
	test()
	measure()