Skip to content

Instantly share code, notes, and snippets.

@zed
Created March 28, 2010 20:11
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save zed/347000 to your computer and use it in GitHub Desktop.
Save zed/347000 to your computer and use it in GitHub Desktop.
profile performance of countchars() functions
*.py[co]
/cachegrind.out.profilestats
/profilestats.prof
#!/usr/bin/env python
"""count_char.py - profile performance of countchars() functions.
Example:
$ time -p python count_char.py big.txt -f all -p
Results in `cachegrind.out.profilestats` (KCachegrind-compatible
format) and `profilestats.prof` (cProfile format)
Type `count_char.py --help` to see available options.
"""
from __future__ import print_function
from warnings import warn
try:
import pyximport; pyximport.install() # pip install cython
from cychar_count import countchars_cython
except ImportError:
import sys
warn(
"warning: can't import pyximport, countchars_cython will be unavailable, try"
"\npip install cython", ImportWarning)
try: reduce = reduce
except NameError:
from functools import reduce
try: unichr = unichr
except NameError:
unichr = chr
try:
_, bytes = unicode, str # python 2.x
except NameError:
pass
import codecs, collections, itertools, fileinput, operator, sys, textwrap
from optparse import OptionParser
try:
from profilestats import profile # pip install profilestats
except ImportError:
warn(
"warning: can't import profilestats, profiling will be unavailable, try"
"\npip install profilestats", ImportWarning)
try:
    import numpy
    import sys

    def countchars_numpy(chars):
        """Count characters by histogramming their code points with numpy.

        `chars` may be text or UTF-8 encoded bytes (bytes are decoded
        first).  Returns a plain dict mapping each character to its
        number of occurrences, with ordinary `int` values.

        The text is encoded to a fixed-width form so the raw buffer can
        be viewed as an integer array: UTF-32 on builds whose strings hold
        full code points (``sys.maxunicode > 0xFFFF``), UTF-16 on narrow
        builds.  This keeps the result consistent with iterating over the
        string, so characters outside the BMP are not split into
        surrogate halves on wide builds.
        """
        if isinstance(chars, bytes):
            chars = chars.decode("utf-8")
        if not chars:
            # Nothing to histogram; avoid handing numpy an empty buffer.
            return {}
        # NOTE: `str` in py3k doesn't have buffer interface, so encode first.
        wide = sys.maxunicode > 0xFFFF
        byteorder = '-le' if sys.byteorder == 'little' else '-be'
        enc = ('utf-32' if wide else 'utf-16') + byteorder
        dtype = numpy.uint32 if wide else numpy.uint16
        a = numpy.frombuffer(chars.encode(enc), dtype=dtype)
        # bincount needs a signed index type; cast explicitly so 32-bit
        # platforms do not reject the uint32 view.
        counts = numpy.bincount(a.astype(numpy.intp))
        # Visit only the ordinals that occurred instead of the whole table.
        return dict((unichr(int(i)), int(counts[i]))
                    for i in numpy.flatnonzero(counts))
except ImportError:
    warn("warning: can't import numpy, countchars_numpy() will be unavailable"
         ", try\nsudo apt-get install python-numpy", ImportWarning)
try:
    from smark import countchars as _countchars_smark
except ImportError:
    warn(
        "warning: can't import smark, countchars_smark() will be unavailable",
        ImportWarning)
else:
    def countchars_smark(chars):
        """Count characters using the `smark` C extension.

        Bytes input is decoded as UTF-8 before being handed to the
        extension; the (character, count) pairs it returns are collected
        into a dict.
        """
        text = chars.decode('utf-8') if isinstance(chars, bytes) else chars
        pairs = _countchars_smark(text)
        return dict(pairs)
def getchars(filename):
    """Return the whole content of `filename` decoded as UTF-8 text.

    The file is opened through `codecs.open` and closed as soon as it has
    been read, instead of leaving the handle to the garbage collector.
    """
    with codecs.open(filename, encoding='utf-8') as stream:
        return stream.read()
def countchars_Counter(chars):
    """Tally the items of `chars` with `collections.Counter`.

    The Counter (a dict subclass) is returned as-is.
    """
    tally = collections.Counter(chars)
    return tally
def countchars_defaultdict(chars):
    """Tally the items of `chars` using a `defaultdict(int)`.

    Returns a plain dict mapping each item to its number of occurrences.
    """
    tally = collections.defaultdict(int)
    for symbol in chars:
        # Missing keys start at int() == 0, so no membership test is needed.
        tally[symbol] += 1
    return dict(tally)
def countchars_list(chars, ord=ord):
    """Tally characters in a list indexed by character ordinal.

    `chars` is iterated one character at a time; `ord` is bound as a
    default argument so the lookup inside the loop is a fast local.

    The table starts with 256 slots and is enlarged on demand, so
    characters above U+00FF are counted instead of raising IndexError.

    Returns a dict mapping each character to its number of occurrences.
    """
    table = [0] * 256
    for ch in chars:
        code = ord(ch)
        try:
            table[code] += 1
        except IndexError:
            # Ordinal beyond the current table: extend it just far enough.
            table.extend([0] * (code + 1 - len(table)))
            table[code] += 1
    return dict((unichr(i), v) for i, v in enumerate(table) if v)
def countchars_dict(chars):
    """Tally the items of `chars` in a plain dict via `dict.setdefault`.

    Returns the dict mapping each item to its number of occurrences.
    """
    tally = {}
    for symbol in chars:
        # setdefault() yields the current count, inserting 0 on first sight.
        seen_so_far = tally.setdefault(symbol, 0)
        tally[symbol] = seen_so_far + 1
    return tally
def countchars_dict_try_catch(chars):
    """Tally the items of `chars` in a plain dict using try/except.

    The increment is attempted first; a KeyError marks the first
    occurrence of an item.  Returns the dict of counts.
    """
    tally = {}
    for symbol in chars:
        try:
            tally[symbol] += 1
        except KeyError:
            # First time this item is seen.
            tally[symbol] = 1
    return tally
def printcounts(counts):
    """Print (character, count) pairs as a wrapped, space-separated list.

    `counts` is an iterable of ``(character, count)`` pairs.  Each pair is
    rendered as ``(c n)``; space, tab and newline are shown through
    `repr` so that wrapping cannot swallow them.  The output is wrapped
    at 79 columns.

    The text is wrapped as text and only encoded to UTF-8 afterwards, and
    only when it is not already the native `str` type (Python 2
    `unicode`).  Wrapping encoded bytes fails on Python 3.
    """
    s = ' '.join(
        # str() is safe here: the repr branch only sees ASCII whitespace.
        '(%s %d)' % (repr(str(c)) if c in ' \t\n' else c, n)
        for c, n in counts)
    wrapped = textwrap.fill(s, width=79)
    if not isinstance(wrapped, str):
        # Python 2 `unicode`: encode explicitly so printing does not depend
        # on the stream's default encoding.
        wrapped = wrapped.encode('utf-8')
    print(wrapped)
def eval_dottedname(dottedname):
    """Resolve a dotted name such as ``"os.path.join"`` to an object.

    The first component is imported as a top-level module; every
    following component is looked up with `getattr` on the previous
    result.

    >>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
    <function join at 0x...>
    >>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
    <built-in function exit>
    >>> eval_dottedname("sys") #doctest: +ELLIPSIS
    <module 'sys' (built-in)>
    """
    components = dottedname.split(".")
    target = __import__(components[0])
    for attribute in components[1:]:
        target = getattr(target, attribute)
    return target
def sortcounts(counts):
    """Return (item, count) pairs ordered by count, then item, descending.

    `counts` may be a mapping (anything with an ``items`` method) or an
    iterable of pairs.  The result is always a new list.
    """
    pairs = counts.items() if hasattr(counts, 'items') else counts
    # Sort key is (count, item): highest count first, ties broken by item.
    by_count_then_item = operator.itemgetter(1, 0)
    return sorted(pairs, key=by_count_then_item, reverse=True)


# Canonical form used to compare the output of different counters.
normalize = sortcounts
def run(filename, countchars):
    """Count the characters of `filename` with `countchars` and print them.

    The file is read with `getchars`, the counts are put into canonical
    order by `normalize`, and the result is written by `printcounts`.
    """
    text = getchars(filename)
    tallies = countchars(text)
    printcounts(normalize(tallies))
def run_all(filename):
    """Run every ``countchars_*`` function on `filename` and cross-check them.

    Each function found in the `count_char` module is applied to the same
    text.  A function that raises is reported on stderr with its traceback
    and left out of the comparison.  The normalized results of the
    remaining functions must all be equal; the agreed result is printed
    once.

    Raises:
        AssertionError: if two functions disagree.  The argument is
            ``((name, name2), (result, result2))``.
    """
    import traceback
    # Self-import: the module's own namespace is scanned for counters.
    import count_char

    funcs = [getattr(count_char, n) for n in dir(count_char)
             if n.startswith('countchars_')]
    chars = getchars(filename)
    results = {}
    for f in funcs:
        try:
            results[f.__name__] = normalize(f(chars))
        except Exception:
            # Report and carry on; KeyboardInterrupt/SystemExit propagate.
            print('_'*79, file=sys.stderr)
            print('%s():' % f.__name__, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            print('-'*79, file=sys.stderr)
    if not results:
        # Every counter failed (already reported above): nothing to print.
        return
    # Each unordered pair is checked once; raising explicitly keeps the
    # check alive under `python -O`.
    for (name, result), (name2, result2) in itertools.combinations(
            results.items(), 2):
        if result != result2:
            raise AssertionError(((name, name2), (result, result2)))
    printcounts(next(iter(results.values())))
def main(argv):
    """Parse the command line and run the selected counting function.

    `argv` is the full argument vector including the program name, so the
    optional input file is the second positional argument; it defaults to
    ``/usr/share/dict/american-english``.

    Options:
        -p / --profile   wrap the run in the `profilestats` profiler.
        -f / --function  dotted name of the counting function, or 'all'
                         to run and cross-check every counter.

    Returns whatever the selected runner returns.  Exits with a usage
    error if profiling is requested but `profilestats` could not be
    imported.
    """
    parser = OptionParser()
    parser.add_option('-p', '--profile', action="store_true", default=False,
                      help="enable profiler")
    parser.add_option('-f', '--function',
                      default="count_char.countchars_defaultdict",
                      help="specify which countchars function to use or 'all'")
    options, args = parser.parse_args(argv)
    # args[0] is the program name, hence the file name is args[1].
    fargs = [args[1] if len(args) > 1 else '/usr/share/dict/american-english']
    if options.function == 'all':
        f = run_all
    else:
        f = run
        fargs += [eval_dottedname(options.function)]
    if options.profile:
        try:
            profiler = profile
        except NameError:
            # The optional import at module level failed earlier.
            parser.error("--profile requires the profilestats package, try"
                         "\npip install profilestats")
        f = profiler(f)
    return f(*fargs)


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#cython: language_level=2, boundscheck=False, wraparound=False
import cython
try:
from __builtin__ import unichr
except ImportError:
unichr = chr # py3k
# Count the byte values of a `bytes` string.
# `L` is declared through @cython.locals as a C array of 0x100 Py_ssize_t
# slots, one per possible byte value; `c` is a Py_ssize_t used both as the
# table index and as the loop variable over `chars`.
# Returns a dict mapping unichr(byte value) -> count for every byte value
# whose count is non-zero.
@cython.locals(chars=bytes, c=cython.Py_ssize_t, L=cython.Py_ssize_t[0x100])
def countchars_cython_bytes(chars):
# Clear every slot before counting.
for c in range(0x100):
L[c] = 0
for c in chars: # translates into an efficient loop in Cython 0.13+
L[c] += 1
# Only byte values that actually occurred are reported.
return {unichr(c): L[c] for c in range(0x100) if L[c]}
# Count the characters of a `unicode` string.
# `L` is declared through @cython.locals as a C array of 0x10000 Py_ssize_t
# slots indexed by character ordinal; `i` is a Py_ssize_t index.
# Returns a dict mapping unichr(ordinal) -> count for every ordinal whose
# count is non-zero.
# NOTE(review): the loop variable `c` is not declared in @cython.locals,
# unlike `i`; presumably Cython infers its type from `chars` -- confirm.
# NOTE(review): `L` has only 0x10000 slots, so a character with an ordinal
# above 0xFFFF would presumably index past the end of the table -- confirm
# that callers never pass such text.
@cython.locals(chars=unicode, i=cython.Py_ssize_t, L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
# Clear every slot before counting.
for i in range(0x10000):
L[i] = 0
for c in chars:
L[c] += 1
# Only ordinals that actually occurred are reported.
return {unichr(i): L[i] for i in range(0x10000) if L[i]}
from distutils.core import setup, Extension
# The package consists of a single C extension module, `smark`, built from
# smarkmodule.c.
smark_extension = Extension('smark', sources=['smarkmodule.c'])

setup(
    name='smark',
    version='0.2',
    description='count characters in C',
    ext_modules=[smark_extension],
)
/**
See http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character/2532564#2532564
*/
#include <Python.h>
/*
 * countchars(text) -> list of (character, count) tuples.
 *
 * Counts how often each character occurs in the unicode argument and
 * returns the pairs in order of first appearance.
 *
 * Two scratch tables of 0x10000 entries are allocated on the first call
 * and reused by later calls (they are never freed):
 *   char_counter[ordinal] - occurrences of that ordinal in this call
 *   char_list[k]          - the k-th distinct character seen in this call
 *
 * Errors:
 *   MemoryError - a scratch table could not be allocated
 *   ValueError  - the text contains a character with an ordinal above
 *                 0xFFFF, which would not fit in the tables
 *   plus whatever argument parsing or object creation raises.
 *
 * NOTE(review): the "u#" buffer is received through a wchar_t pointer,
 * which assumes Py_UNICODE and wchar_t have the same width -- confirm for
 * the target build.
 */
static PyObject *
CharCounter(PyObject *self, PyObject *args)
{
    static unsigned *char_counter = NULL;
    static wchar_t *char_list = NULL;
    wchar_t *t1;
    int l1 = 0;             /* "u#" stores an int: PY_SSIZE_T_CLEAN is not defined */
    unsigned i;
    unsigned chlen = 0;     /* number of distinct characters seen so far */
    unsigned long code;
    int status;
    PyObject *resultList, *itemTuple, *ch, *count;

    if (!char_counter && !(char_counter = (unsigned*)malloc(sizeof(unsigned)*0x10000))) goto memory_error;
    if (!char_list && !(char_list = (wchar_t*)malloc(sizeof(wchar_t)*0x10000))) goto memory_error;
    if (!PyArg_ParseTuple(args, "u#", &t1, &l1)) return NULL;

    /* The tables are shared between calls, so start from a clean slate. */
    for (i = 0; i <= 0xffff; i++) char_counter[i] = 0;

    for (i = 0; i < (unsigned)l1; i++) {
        /* Go through unsigned long so a signed wchar_t cannot produce a
           negative index. */
        code = (unsigned long)t1[i];
        if (code > 0xffffUL) {
            /* Refuse instead of writing past the end of the table. */
            PyErr_SetString(PyExc_ValueError,
                "unicode characters with ordinals larger than 0xFFFF are unsupported");
            return NULL;
        }
        if (char_counter[code] == 0) char_list[chlen++] = t1[i];
        char_counter[code]++;
    }

    resultList = PyList_New(0);
    if (!resultList) return NULL;
    for (i = 0; i < chlen; i++) {
        ch = PyUnicode_FromWideChar(&char_list[i], 1);
        if (!ch) goto build_error;
        count = PyInt_FromLong(char_counter[char_list[i]]);
        if (!count) {
            Py_DECREF(ch);
            goto build_error;
        }
        /* PyTuple_Pack takes its own references, so ours are dropped
           whether or not it succeeds. */
        itemTuple = PyTuple_Pack(2, ch, count);
        Py_DECREF(ch);
        Py_DECREF(count);
        if (!itemTuple) goto build_error;
        status = PyList_Append(resultList, itemTuple);
        Py_DECREF(itemTuple);
        if (status < 0) goto build_error;
    }
    return resultList;

build_error:
    /* An exception is already set; release the partially built list. */
    Py_DECREF(resultList);
    return NULL;

memory_error:
    /* malloc() failed */
    PyErr_NoMemory();
    return NULL;
}
/* Method table of the module: exposes CharCounter to Python under the name
   "countchars", called with positional arguments (METH_VARARGS).  The
   all-NULL entry terminates the table. */
static PyMethodDef SmarkMethods[] = {
{"countchars", (PyCFunction)CharCounter, METH_VARARGS,
"Count character frequencies in a given unicode text.\n"
"Note: unicode characters with ordinals larger than 0xFFFF are unsupported"},
{NULL, NULL, 0, NULL} /* Sentinel */
};
PyMODINIT_FUNC
initsmark(void)
{
(void) Py_InitModule("smark", SmarkMethods);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment