Created
March 28, 2010 20:11
-
-
Save zed/347000 to your computer and use it in GitHub Desktop.
profile performance of countchars() functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.py[co] | |
/cachegrind.out.profilestats | |
/profilestats.prof |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""count_char.py - profile performance of countchars() functions. | |
Example: | |
$ time -p python count_char.py big.txt -f all -p | |
Results in `cachegrind.out.profilestats` (KCachegrind-compatible | |
format) and `profilestats.prof` (cProfile format) | |
Type `count_char.py --help` to see available options. | |
""" | |
from __future__ import print_function | |
from warnings import warn | |
try: | |
import pyximport; pyximport.install() # pip install cython | |
from cychar_count import countchars_cython | |
except ImportError: | |
import sys | |
warn( | |
"warning: can't import pyximport, countchars_cython will be unavailable, try" | |
"\npip install cython", ImportWarning) | |
try: reduce = reduce | |
except NameError: | |
from functools import reduce | |
try: unichr = unichr | |
except NameError: | |
unichr = chr | |
try: | |
_, bytes = unicode, str # python 2.x | |
except NameError: | |
pass | |
import codecs, collections, itertools, fileinput, operator, sys, textwrap | |
from optparse import OptionParser | |
try: | |
from profilestats import profile # pip install profilestats | |
except ImportError: | |
warn( | |
"warning: can't import profilestats, profiling will be unavailable, try" | |
"\npip install profilestats", ImportWarning) | |
try:
    import numpy
    import sys

    def countchars_numpy(chars):
        """Count character frequencies with numpy.

        `chars` is a text string; a byte string is decoded as UTF-8 first.
        Returns a dict mapping each character that occurs to its count.

        The text is encoded into fixed-width code units matching what the
        interpreter itself treats as one character: UTF-32 on wide builds
        (every Python 3.3+), UTF-16 on narrow Python 2 builds.  Encoding
        everything as UTF-16 would split characters above U+FFFF into two
        surrogate halves and count those instead of the real character.
        """
        if isinstance(chars, bytes):
            chars = chars.decode("utf-8")
        # NOTE: `str` in py3k doesn't have buffer interface, so the ordinals
        # are obtained by encoding in native byte order and reinterpreting.
        if sys.maxunicode > 0xFFFF:
            codec, unit = 'utf-32', numpy.uint32
        else:
            codec, unit = 'utf-16', numpy.uint16
        enc = codec + ('le' if sys.byteorder == 'little' else 'be')
        a = numpy.frombuffer(chars.encode(enc), dtype=unit)
        # bincount() needs indices that cast safely to intp (uint32 does not
        # on 32-bit platforms), hence the explicit conversion.
        counts = numpy.bincount(a.astype(numpy.intp))
        # Visit only the ordinals that actually occur instead of scanning the
        # whole (possibly 0x110000 entries long) table in Python.
        present = numpy.flatnonzero(counts)
        return dict((unichr(i), v)
                    for i, v in zip(present.tolist(), counts[present].tolist()))
except ImportError:
    warn("warning: can't import numpy, countchars_numpy() will be unavailable"
         ", try\nsudo apt-get install python-numpy", ImportWarning)
try:
    from smark import countchars as _countchars_smark
except ImportError:
    warn(
        "warning: can't import smark, countchars_smark() will be unavailable",
        ImportWarning)
else:
    def countchars_smark(chars):
        """Count characters with the `smark` C extension.

        A byte string is decoded as UTF-8 before it is handed to the
        extension; the (character, count) pairs it returns are collected
        into a dict.
        """
        text = chars.decode('utf-8') if isinstance(chars, bytes) else chars
        pairs = _countchars_smark(text)
        return dict(pairs)
def getchars(filename):
    """Return the entire content of `filename` decoded as UTF-8 text.

    The file is opened through `codecs.open` and closed again before
    returning, so no descriptor is left for the garbage collector.
    Raises IOError/OSError if the file cannot be opened and
    UnicodeDecodeError if it is not valid UTF-8.
    """
    with codecs.open(filename, encoding='utf-8') as stream:
        return stream.read()
def countchars_Counter(chars):
    """Count the items of `chars` with `collections.Counter`.

    Returns the Counter itself (a dict subclass mapping item -> count).
    """
    tally = collections.Counter()
    tally.update(chars)
    return tally
def countchars_defaultdict(chars):
    """Count the items of `chars` using a `defaultdict(int)`.

    Returns a plain dict mapping item -> number of occurrences.
    """
    tally = collections.defaultdict(int)
    for symbol in chars:
        # Missing keys start at int() == 0.
        tally[symbol] = tally[symbol] + 1
    return dict(tally)
def countchars_list(chars, ord=ord):
    """Count characters using a list indexed by ordinal.

    `chars` is an iterable of one-character strings; `ord` maps a character
    to its index (bound as a default argument so the lookup is local).
    Returns a dict mapping character -> count.

    The table starts with 256 slots, which covers Latin-1 text, and grows
    whenever a character with a larger ordinal is seen, so text containing
    characters above U+00FF no longer raises IndexError.
    """
    table = [0] * 256
    for ch in chars:
        code = ord(ch)
        try:
            table[code] += 1
        except IndexError:
            # Ordinal beyond the current table: extend it up to `code`.
            table.extend([0] * (code + 1 - len(table)))
            table[code] += 1
    return dict((unichr(i), v) for i, v in enumerate(table) if v)
def countchars_dict(chars):
    """Count the items of `chars` with a plain dict.

    Returns a dict mapping item -> number of occurrences.
    """
    tally = {}
    for symbol in chars:
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally
def countchars_dict_try_catch(chars):
    """Count the items of `chars` with a plain dict and try/except.

    The first occurrence of an item is detected through the KeyError raised
    by the lookup.  Returns a dict mapping item -> number of occurrences.
    """
    tally = {}
    for symbol in chars:
        try:
            seen = tally[symbol]
        except KeyError:
            tally[symbol] = 1
        else:
            tally[symbol] = seen + 1
    return tally
def printcounts(counts):
    """Print `counts` as `(char count)` pairs wrapped at 79 columns.

    `counts` is an iterable of (character, count) pairs.  Space, tab and
    newline are shown through `repr()` (e.g. `'\\n'`) so they stay visible.

    Works on both Python 2 and Python 3: the text is encoded to UTF-8 only
    when it is not already a native `str` (i.e. a Python 2 `unicode`).
    Encoding unconditionally made `repr()` produce `b' '` and made
    `textwrap.fill()` fail on bytes under Python 3.
    """
    def show(c):
        # str() of a whitespace character is the native string on both
        # major versions, so repr() never carries a b''/u'' prefix.
        return repr(str(c)) if c in ' \t\n' else c

    s = ' '.join('(%s %d)' % (show(c), n) for c, n in counts)
    if not isinstance(s, str):
        s = s.encode('utf-8')  # Python 2 unicode -> UTF-8 byte string
    print(textwrap.fill(s, width=79))
def eval_dottedname(dottedname):
    """Return the object named by `dottedname`, e.g. "package.module.attr".

    The first component is imported as a module and the remaining ones are
    resolved with getattr().  When an attribute is missing, the name is
    tried as a submodule and imported, so names such as
    "xml.dom.minidom.parseString" resolve even if the parent package does
    not import the submodule itself.

    Raises ImportError if the first component cannot be imported and
    AttributeError if a later component is neither an attribute nor an
    importable submodule.

    >>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
    <function join at 0x...>
    >>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
    <built-in function exit>
    >>> eval_dottedname("sys") #doctest: +ELLIPSIS
    <module 'sys' (built-in)>
    """
    import importlib

    parts = dottedname.split(".")
    obj = __import__(parts[0])
    # `depth` is the number of leading components that name `attr` in full.
    for depth, attr in enumerate(parts[1:], 2):
        try:
            obj = getattr(obj, attr)
        except AttributeError:
            try:
                # Importing a submodule also binds it on its parent package.
                importlib.import_module(".".join(parts[:depth]))
            except ImportError:
                raise AttributeError("%r has no attribute %r"
                                     % (".".join(parts[:depth - 1]), attr))
            obj = getattr(obj, attr)
    return obj
def sortcounts(counts):
    """Return (item, count) pairs ordered by descending count, then item.

    `counts` may be a mapping (anything with an `items` attribute) or an
    iterable of pairs.  Ties on the count are broken by the item itself,
    also in descending order.
    """
    pairs = counts.items() if hasattr(counts, 'items') else counts
    ordered = list(pairs)
    ordered.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return ordered


# Canonical form used to compare the output of different implementations.
normalize = sortcounts
def run(filename, countchars):
    """Count the characters of `filename` with `countchars` and print them.

    The file is read with getchars(), the result is put into normalized
    (sorted) order and written with printcounts().
    """
    text = getchars(filename)
    tally = countchars(text)
    printcounts(normalize(tally))
def run_all(filename):
    """Run every `countchars_*` function on `filename` and cross-check them.

    Each function found in this module is applied to the file content.  A
    function that raises is reported on stderr with its traceback and
    skipped.  The normalized results of the remaining functions must all be
    equal (AssertionError otherwise); one of them is then printed.

    Returns None on success, or 1 if no function produced a result.
    """
    import traceback

    # Look the functions up in the running module itself; this also works
    # when the script is run as __main__ or saved under another file name.
    module = sys.modules[__name__]
    funcs = [getattr(module, n) for n in dir(module)
             if n.startswith('countchars_')]
    chars = getchars(filename)
    results = {}
    for f in funcs:
        try:
            results[f.__name__] = normalize(f(chars))
        except Exception:  # let KeyboardInterrupt/SystemExit propagate
            print('_'*79, file=sys.stderr)
            print('%s():' % f.__name__, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            print('-'*79, file=sys.stderr)
    if not results:
        print('no countchars_*() function produced a result', file=sys.stderr)
        return 1
    names = sorted(results)
    # Each unordered pair is compared once.
    for name, name2 in itertools.combinations(names, 2):
        result, result2 = results[name], results[name2]
        assert result == result2, ((name, name2), (result, result2))
    printcounts(results[names[0]])
def main(argv):
    """Command-line entry point.

    `argv` is the full argument vector including the program name.  The
    first positional argument is the file to analyse (default:
    /usr/share/dict/american-english).  `-f NAME` selects the counting
    function by dotted name, or 'all' to run and compare every one;
    `-p` wraps the run in the `profilestats` profiler.

    Returns whatever the selected runner returns (used as exit status).
    Exits with an option error if `-p` is given while `profilestats` is
    not installed, instead of failing with a NameError.
    """
    parser = OptionParser()
    parser.add_option('-p', '--profile', action="store_true", default=False,
                      help="enable profiler")
    parser.add_option('-f', '--function', default="count_char.countchars_defaultdict",
                      help="specify which countchars function to use or 'all'")
    options, args = parser.parse_args(argv)
    # args[0] is the program name, args[1] the optional input file.
    fargs = [args[1] if len(args) > 1 else '/usr/share/dict/american-english']
    if options.function == 'all':
        f = run_all
    else:
        f = run
        fargs += [eval_dottedname(options.function)]
    if options.profile:
        try:
            # `profile` exists only if the optional import at the top of the
            # file succeeded.
            profiler = profile
        except NameError:
            parser.error("profiling requires the 'profilestats' package, try"
                         "\npip install profilestats")
        f = profiler(f)
    return f(*fargs)
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#cython: language_level=2, boundscheck=False, wraparound=False | |
import cython | |
try: | |
from __builtin__ import unichr | |
except ImportError: | |
unichr = chr # py3k | |
@cython.locals(chars=bytes, code=cython.Py_ssize_t,
               tally=cython.Py_ssize_t[0x100])
def countchars_cython_bytes(chars):
    """Count how often each byte value occurs in the byte string `chars`.

    `tally` is a C array with one slot per possible byte value (0x100).
    Returns a dict mapping unichr(value) -> count for the values that occur.
    """
    # The C array is not zero-initialised, so clear it first.
    for code in range(0x100):
        tally[code] = 0
    for code in chars:  # translates into an efficient loop in Cython 0.13+
        tally[code] += 1
    return {unichr(code): tally[code] for code in range(0x100) if tally[code]}
@cython.locals(chars=unicode, i=cython.Py_ssize_t, c=cython.Py_UCS4,
               L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    """Count how often each character occurs in the unicode string `chars`.

    Characters up to U+FFFF are counted in `L`, a C array with one slot per
    code point.  Characters above U+FFFF are counted in a regular dict:
    `L` has only 0x10000 slots and this module is compiled with
    boundscheck=False, so indexing it with a larger code point would write
    outside the array.

    Returns a dict mapping character -> count.
    """
    # The C array is not zero-initialised, so clear it first.
    for i in range(0x10000):
        L[i] = 0
    astral = {}
    for c in chars:
        if c < 0x10000:
            L[c] += 1
        else:
            # `c` is converted to a one-character unicode object when it is
            # used as a dict key.
            astral[c] = astral.get(c, 0) + 1
    counts = {unichr(i): L[i] for i in range(0x10000) if L[i]}
    counts.update(astral)
    return counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from distutils.core import setup, Extension | |
# The `smark` extension module is built from the single C source file.
smark_extension = Extension('smark', sources=['smarkmodule.c'])

setup(
    name='smark',
    version='0.2',
    description='count characters in C',
    ext_modules=[smark_extension],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
See http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character/2532564#2532564 | |
*/ | |
#include <Python.h> | |
/**
 * countchars(text) -> list of (character, count) tuples.
 *
 * Counts how often each character occurs in the unicode argument.  The
 * tuples appear in order of first occurrence in the text.
 *
 * Only ordinals up to 0xFFFF are supported because the counters live in a
 * 0x10000-entry table; a larger ordinal raises ValueError rather than
 * indexing past the end of the table.
 *
 * Returns a new reference, or NULL with an exception set (TypeError from
 * argument parsing, ValueError, or MemoryError).
 */
static PyObject *
CharCounter(PyObject *self, PyObject *args)
{
    enum { TABLE_SIZE = 0x10000 };

    /* Scratch tables: allocated on first use and reused by later calls. */
    static unsigned *char_counter = NULL;  /* occurrences, indexed by ordinal */
    static Py_UNICODE *char_list = NULL;   /* distinct characters, first-seen order */

    Py_UNICODE *text = NULL;
    int text_len = 0;
    unsigned distinct = 0;
    unsigned k;
    int i;
    PyObject *result_list;

    (void)self;

    if (!char_counter) {
        char_counter = malloc(TABLE_SIZE * sizeof *char_counter);
        if (!char_counter)
            return PyErr_NoMemory();
    }
    if (!char_list) {
        char_list = malloc(TABLE_SIZE * sizeof *char_list);
        if (!char_list)
            return PyErr_NoMemory();
    }

    /* "u#" stores a Py_UNICODE pointer and an int length (this file does
     * not define PY_SSIZE_T_CLEAN), so the receiving variables must have
     * exactly those types. */
    if (!PyArg_ParseTuple(args, "u#", &text, &text_len))
        return NULL;

    memset(char_counter, 0, TABLE_SIZE * sizeof *char_counter);

    for (i = 0; i < text_len; i++) {
        /* Widen through an unsigned type so a signed Py_UNICODE cannot
         * yield a negative index. */
        unsigned long code = (unsigned long)text[i];
        if (code >= TABLE_SIZE) {
            PyErr_SetString(PyExc_ValueError,
                            "countchars() supports only characters with "
                            "ordinals up to 0xFFFF");
            return NULL;
        }
        if (char_counter[code]++ == 0)
            char_list[distinct++] = text[i];
    }

    result_list = PyList_New(0);
    if (!result_list)
        return NULL;

    for (k = 0; k < distinct; k++) {
        PyObject *item = Py_BuildValue("(u#I)", &char_list[k], 1,
                                       char_counter[char_list[k]]);
        if (!item || PyList_Append(result_list, item) < 0) {
            Py_XDECREF(item);
            Py_DECREF(result_list);
            return NULL;
        }
        Py_DECREF(item);  /* the list holds its own reference */
    }
    return result_list;
}
static PyMethodDef SmarkMethods[] = { | |
{"countchars", (PyCFunction)CharCounter, METH_VARARGS, | |
"Count character frequencies in a given unicode text.\n" | |
"Note: unicode characters with ordinals larger than 0xFFFF are unsupported"}, | |
{NULL, NULL, 0, NULL} /* Sentinel */ | |
}; | |
PyMODINIT_FUNC | |
initsmark(void) | |
{ | |
(void) Py_InitModule("smark", SmarkMethods); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment