zed/.gitignore

## .gitignore
*.py[co]
/cachegrind.out.profilestats
/profilestats.prof

## count_char.py
#!/usr/bin/env python
"""count_char.py - profile performance of countchars() functions.

Example:

    $ time -p python count_char.py big.txt -f all -p

Results in `cachegrind.out.profilestats` (KCachegrind-compatible
format) and `profilestats.prof` (cProfile format)

Type `count_char.py --help` to see available options.
"""
from __future__ import print_function
from warnings import warn
try:
    import pyximport; pyximport.install() # pip install cython
    from cychar_count import countchars_cython
except ImportError:
    import sys
    warn(
        "warning: can't import pyximport, countchars_cython will be unavailable, try"
        "\npip install cython", ImportWarning)

try: reduce = reduce
except NameError:
    from functools import reduce

try: unichr = unichr
except NameError:
    unichr = chr

try:
    _, bytes = unicode, str # python 2.x
except NameError:
    pass

import codecs, collections, itertools, fileinput, operator, sys, textwrap
from optparse import OptionParser

try:
    from profilestats import profile # pip install profilestats
except ImportError:
    warn(
        "warning: can't import profilestats, profiling will be unavailable, try"
        "\npip install profilestats", ImportWarning)

try:
    import numpy
    import sys
    def countchars_numpy(chars):
        """Count characters with `numpy.bincount` over the text's ordinals.

        `chars` may be text or UTF-8 encoded bytes (bytes are decoded first).
        Returns a dict mapping each character to its number of occurrences
        (plain Python ints); empty input yields an empty dict.
        """
        if isinstance(chars, bytes):
            chars = chars.decode("utf-8")
        if not chars:
            return {}

        # Pick a fixed-width encoding that matches what this interpreter
        # calls "one character": UTF-32 where code points above 0xFFFF are
        # single characters, UTF-16 on narrow builds.  Always using UTF-16
        # would count surrogate halves instead of the real characters.
        if sys.maxunicode > 0xFFFF:
            enc, dtype = 'utf-32', numpy.uint32
        else:
            enc, dtype = 'utf-16', numpy.uint16
        #NOTE: `str` in py3k doesn't have buffer interface
        enc += '-le' if sys.byteorder == 'little' else '-be'
        ordinals = numpy.frombuffer(chars.encode(enc), dtype=dtype)

        # bincount() needs a type that casts safely to intp on every platform
        counts = numpy.bincount(ordinals.astype(numpy.intp))

        # only visit the ordinals that actually occur
        present = numpy.flatnonzero(counts)
        return dict((unichr(i), n) for i, n in
                    zip(present.tolist(), counts[present].tolist()))
except ImportError:
    warn("warning: can't import numpy, countchars_numpy() will be unavailable"
        ", try\nsudo apt-get install python-numpy", ImportWarning)

try:
    from smark import countchars as _countchars_smark
except ImportError:
    warn(
        "warning: can't import smark, countchars_smark() will be unavailable",
        ImportWarning)
else:
    def countchars_smark(chars):
        """Count characters using the `smark` C extension.

        Bytes input is decoded as UTF-8 first; the (char, count) pairs
        returned by the extension are collected into a dict.
        """
        text = chars.decode('utf-8') if isinstance(chars, bytes) else chars
        return dict(_countchars_smark(text))

def getchars(filename):
    """Return the whole content of `filename` decoded as UTF-8 text.

    The file is closed before returning, even if reading or decoding fails.
    """
    with codecs.open(filename, encoding='utf-8') as f:
        return f.read()

def countchars_Counter(chars):
    """Count characters by handing `chars` to `collections.Counter`.

    Returns the Counter itself (a dict subclass) mapping character -> count.
    """
    tally = collections.Counter(chars)
    return tally

def countchars_defaultdict(chars):
    """Count characters with a `defaultdict(int)` accumulator.

    Returns a plain dict mapping character -> count.
    """
    tally = collections.defaultdict(int)
    for symbol in chars:
        # a missing key is created as 0 by the default factory
        tally[symbol] += 1
    plain = dict(tally)
    return plain

def countchars_list(chars, ord=ord):
    """Count characters using a list indexed by ordinal.

    `ord` is bound as a default argument so the loop uses a fast local
    lookup.  The table starts with 256 slots and grows on demand, so text
    containing characters beyond Latin-1 is counted instead of raising
    IndexError.  Returns a dict mapping character -> count.
    """
    counts = [0] * 256
    for ch in chars:
        i = ord(ch)
        try:
            counts[i] += 1
        except IndexError:
            # first ordinal beyond the current table: extend up to it
            counts.extend([0] * (i + 1 - len(counts)))
            counts[i] += 1
    return dict((unichr(i), n) for i, n in enumerate(counts) if n)

def countchars_dict(chars):
    """Count characters with a plain dict and `dict.setdefault`.

    Returns a dict mapping character -> count.
    """
    tally = {}
    for symbol in chars:
        seen_so_far = tally.setdefault(symbol, 0)
        tally[symbol] = seen_so_far + 1
    return tally

def countchars_dict_try_catch(chars):
    """Count characters with a plain dict, handling new keys via KeyError.

    Returns a dict mapping character -> count.
    """
    tally = {}
    for symbol in chars:
        try:
            tally[symbol] += 1
        except KeyError:
            # first occurrence of this character
            tally[symbol] = 1
    return tally

def printcounts(counts):
    """Print (char, count) pairs as `(c n)` items wrapped at 79 columns.

    `counts` is an iterable of (character, count) pairs.  Space, tab and
    newline are shown quoted/escaped so they stay visible after wrapping.
    """
    def show(c):
        # repr(str(c)) renders as ' ' / '\t' / '\n' (quoted) on both
        # Python 2 and 3; repr() of the encoded bytes would add a b prefix
        # on Python 3.
        return repr(str(c)) if c in ' \t\n' else c

    s = ' '.join('(%s %d)' % (show(c), n) for c, n in counts)
    # Wrap the text itself: textwrap cannot process bytes on Python 3 and
    # would measure the width in bytes rather than characters on Python 2.
    text = textwrap.fill(s, width=79)
    if not isinstance(text, str):
        # Python 2: `text` is `unicode`; encode explicitly so printing to a
        # pipe or file does not depend on the default ASCII codec.
        text = text.encode('utf-8')
    print(text)

def eval_dottedname(dottedname):
    """Resolve a dotted name such as "package.module.attr" to an object.

    The first component is imported as a module and the remaining
    components are looked up with getattr().  A component that names a
    submodule which has not been imported yet is imported on demand.
    Raises ImportError if the first component cannot be imported and
    AttributeError if a later component cannot be resolved.

    >>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
    <function join at 0x...>
    >>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
    <built-in function exit>
    >>> eval_dottedname("sys") #doctest: +ELLIPSIS
    <module 'sys' (built-in)>
    """
    import importlib
    import types

    parts = dottedname.split(".")
    obj = __import__(parts[0])
    for depth, attr in enumerate(parts[1:], 2):
        if isinstance(obj, types.ModuleType) and not hasattr(obj, attr):
            # Importing a package does not import its submodules; try to
            # load "a.b" so that it shows up as attribute `b` of `a`.
            try:
                importlib.import_module(".".join(parts[:depth]))
            except ImportError:
                pass  # not a module: getattr() below raises AttributeError
        obj = getattr(obj, attr)
    return obj

def sortcounts(counts):
    """Return (char, count) pairs ordered by count, then char, descending.

    `counts` is either a mapping (its items() are used) or an iterable of
    (char, count) pairs.
    """
    pairs = counts.items() if hasattr(counts, 'items') else counts
    ordered = list(pairs)
    # sort key is (count, char)
    ordered.sort(key=operator.itemgetter(1, 0), reverse=True)
    return ordered

# canonical form used when comparing/printing counter results
normalize = sortcounts

def run(filename, countchars):
    """Count the characters of `filename` with `countchars` and print them.

    `countchars` is a callable taking the file's text and returning counts
    in a form accepted by normalize().
    """
    text = getchars(filename)
    tally = countchars(text)
    printcounts(normalize(tally))

def run_all(filename):
    """Run every available countchars_*() function and cross-check them.

    Each function defined in this module whose name starts with
    'countchars_' is called on the text of `filename`.  A function that
    raises is reported on stderr with its traceback and skipped.  All
    remaining results must agree; the common result is printed.

    Raises AssertionError if two functions disagree and RuntimeError if no
    function produced a result.
    """
    import traceback

    # Look the functions up in the running module instead of re-importing
    # this file by name (which executes it a second time when it is run as
    # a script).
    module = sys.modules[__name__]
    funcs = [getattr(module, n) for n in dir(module)
             if n.startswith('countchars_')]

    chars = getchars(filename)
    results = {}
    for f in funcs:
        try:
            results[f.__name__] = normalize(f(chars))
        except Exception:  # keep going; KeyboardInterrupt still propagates
            print('_'*79, file=sys.stderr)
            print('%s():' % f.__name__, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            print('-'*79, file=sys.stderr)

    if not results:
        raise RuntimeError(
            "no countchars_*() function produced a result for %r" % (filename,))

    # Equality is transitive, so comparing everything against one reference
    # is enough.  Raise explicitly: a bare `assert` disappears under -O.
    names = sorted(results)
    ref_name, reference = names[0], results[names[0]]
    for name in names[1:]:
        if results[name] != reference:
            raise AssertionError(
                ((ref_name, name), (reference, results[name])))
    printcounts(reference)

def main(argv):
    """Command-line entry point.

    `argv` is the full argument vector including the program name; the
    first positional argument after it is the file to analyse (default:
    /usr/share/dict/american-english).  Returns whatever the selected
    runner returns, for use as the process exit status.
    """
    parser = OptionParser()
    parser.add_option('-p', '--profile', action="store_true", default=False,
                      help="enable profiler")
    parser.add_option('-f', '--function', default="count_char.countchars_defaultdict",
                      help="specify which countchars function to use or 'all'")
    options, args = parser.parse_args(argv)

    profiler = None
    if options.profile:
        # `profile` only exists if the optional profilestats import at the
        # top of the file succeeded; report that cleanly instead of failing
        # later with a NameError.
        try:
            profiler = profile
        except NameError:
            parser.error("profiling is unavailable, try: pip install profilestats")

    # args[0] is the program name because the whole argv was parsed
    filename = args[1] if len(args) > 1 else '/usr/share/dict/american-english'
    if options.function == 'all':
        f, fargs = run_all, [filename]
    else:
        f, fargs = run, [filename, eval_dottedname(options.function)]

    if profiler is not None:
        f = profiler(f)

    return f(*fargs)


# Script entry point: run main() on the command line and use its return
# value as the process exit status.
if __name__=="__main__":
    sys.exit(main(sys.argv))

## cychar_count.pyx
#cython: language_level=2, boundscheck=False, wraparound=False
import cython

try:
    from __builtin__ import unichr
except ImportError:
    unichr = chr # py3k

# `chars` is typed as bytes, `c` as a C integer and `L` as a C array with one
# counter per possible byte value (0x100 slots).
@cython.locals(chars=bytes, c=cython.Py_ssize_t, L=cython.Py_ssize_t[0x100])
def countchars_cython_bytes(chars):
    """Count the byte values in `chars`.

    Returns a dict mapping unichr(byte value) -> number of occurrences,
    containing only the values that occur at least once.
    """
    # zero the counters before use
    for c in range(0x100):
        L[c] = 0

    for c in chars: # translates into an efficient loop in Cython 0.13+
        L[c] += 1

    # keep only the byte values that were seen
    return {unichr(c): L[c] for c in range(0x100) if L[c]}

# `chars` is typed as unicode, `i` as a C integer and `L` as a C array of
# 0x10000 counters indexed by character ordinal.
@cython.locals(chars=unicode, i=cython.Py_ssize_t, L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    """Count the characters of the unicode string `chars`.

    Returns a dict mapping unichr(ordinal) -> number of occurrences,
    containing only the ordinals that occur at least once.
    """
    # zero the counters before use
    for i in range(0x10000):
        L[i] = 0

    # NOTE(review): `c` is not declared in cython.locals (unlike in
    # countchars_cython_bytes), so its type is left to Cython -- confirm it
    # is treated as an integer code point here.
    # NOTE(review): L has 0x10000 slots and the file-level directives turn
    # bounds checking off; presumably a character with an ordinal above
    # 0xFFFF indexes past the end of L -- confirm and guard if needed.
    for c in chars:
        L[c] += 1

    # keep only the ordinals that were seen
    return {unichr(i): L[i] for i in range(0x10000) if L[i]}

## setup.py
from distutils.core import setup, Extension

# Build description for the `smark` extension module, compiled from the
# single C source file smarkmodule.c.
setup (name = 'smark',
       version = '0.2',
       description = 'count characters in C',
       ext_modules = [Extension('smark', sources = ['smarkmodule.c'])])


## smarkmodule.c
/**
   See http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character/2532564#2532564
 */
#include <Python.h>


/*
 * countchars(text) -> list of (character, count) tuples.
 *
 * Counts how often each character occurs in the unicode string `text`.
 * Tuples appear in order of first occurrence.  Only ordinals up to 0xFFFF
 * are supported; a larger ordinal raises ValueError instead of indexing
 * past the end of the counter table.  Returns NULL with an exception set
 * on failure.
 */
static PyObject *
CharCounter(PyObject *self, PyObject *args)
{
    /* Scratch tables, allocated on first use and reused by later calls. */
    static unsigned *char_counter = NULL;  /* occurrences, indexed by ordinal */
    static Py_UNICODE *char_list = NULL;   /* distinct chars, first-seen order */

    Py_UNICODE *text;
    int length = 0;  /* "u#" stores an int: PY_SSIZE_T_CLEAN is not defined */
    Py_ssize_t i, nchars = 0;
    PyObject *result, *item;

    /* "u#" yields a Py_UNICODE buffer, whose width may differ from wchar_t */
    if (!PyArg_ParseTuple(args, "u#", &text, &length)) return NULL;

    if (!char_counter &&
        !(char_counter = (unsigned *)malloc(sizeof(unsigned) * 0x10000)))
        return PyErr_NoMemory();
    if (!char_list &&
        !(char_list = (Py_UNICODE *)malloc(sizeof(Py_UNICODE) * 0x10000)))
        return PyErr_NoMemory();

    for (i = 0; i <= 0xFFFF; i++) char_counter[i] = 0;

    for (i = 0; i < length; i++) {
        Py_UNICODE ch = text[i];
        if ((unsigned long)ch > 0xFFFFul) {
            /* would index outside the 0x10000-entry tables */
            PyErr_SetString(PyExc_ValueError,
                "countchars: unicode characters with ordinals larger than "
                "0xFFFF are unsupported");
            return NULL;
        }
        if (char_counter[ch]++ == 0) char_list[nchars++] = ch;
    }

    result = PyList_New(nchars);
    if (!result) return NULL;

    for (i = 0; i < nchars; i++) {
        /* 1-character unicode string plus its count as a Python int */
        item = Py_BuildValue("(u#l)", &char_list[i], 1,
                             (long)char_counter[char_list[i]]);
        if (!item) {
            Py_DECREF(result);  /* do not leak the partially filled list */
            return NULL;
        }
        PyList_SET_ITEM(result, i, item);  /* steals the reference to item */
    }

    return result;
}

/* Method table of the module: exposes CharCounter() to Python under the
   name `countchars`, taking positional arguments (METH_VARARGS). */
static PyMethodDef SmarkMethods[] = {
  {"countchars",  (PyCFunction)CharCounter, METH_VARARGS,
   "Count character frequencies in a given unicode text.\n"
   "Note: unicode characters with ordinals larger than 0xFFFF are unsupported"},
  {NULL, NULL, 0, NULL}        /* Sentinel */
};


/* Module initialisation function: creates the `smark` module with the
   methods listed in SmarkMethods.
   NOTE(review): Py_InitModule()/init<name>() look like the Python 2 module
   API -- presumably this extension targets Python 2 only; confirm. */
PyMODINIT_FUNC
initsmark(void)
{
  (void) Py_InitModule("smark", SmarkMethods);
}
	#!/usr/bin/env python
	"""count_char.py - profile performance of countchars() functions.

	Example:

	$ time -p python count_char.py big.txt -f all -p

	Results in `cachegrind.out.profilestats` (KCachegrind-compatible
	format) and `profilestats.prof` (cProfile format)

	Type `count_char.py --help` to see available options.
	"""
	from __future__ import print_function
	from warnings import warn
	try:
	import pyximport; pyximport.install() # pip install cython
	from cychar_count import countchars_cython
	except ImportError:
	import sys
	warn(
	"warning: can't import pyximport, countchars_cython will be unavailable, try"
	"\npip install cython", ImportWarning)

	try: reduce = reduce
	except NameError:
	from functools import reduce

	try: unichr = unichr
	except NameError:
	unichr = chr

	try:
	_, bytes = unicode, str # python 2.x
	except NameError:
	pass

	import codecs, collections, itertools, fileinput, operator, sys, textwrap
	from optparse import OptionParser

	try:
	from profilestats import profile # pip install profilestats
	except ImportError:
	warn(
	"warning: can't import profilestats, profiling will be unavailable, try"
	"\npip install profilestats", ImportWarning)

	try:
	    import numpy
	    import sys
	    def countchars_numpy(chars):
	        """Count characters with `numpy.bincount` over the text's ordinals.

	        `chars` may be text or UTF-8 encoded bytes (bytes are decoded
	        first).  Returns a dict mapping each character to its number of
	        occurrences; empty input yields an empty dict.
	        """
	        if isinstance(chars, bytes):
	            chars = chars.decode("utf-8")
	        if not chars:
	            return {}

	        # Use a fixed-width encoding that matches what this interpreter
	        # calls "one character" (UTF-32 unless this is a narrow build).
	        if sys.maxunicode > 0xFFFF:
	            enc, dtype = 'utf-32', numpy.uint32
	        else:
	            enc, dtype = 'utf-16', numpy.uint16
	        #NOTE: `str` in py3k doesn't have buffer interface
	        enc += '-le' if sys.byteorder == 'little' else '-be'
	        ordinals = numpy.frombuffer(chars.encode(enc), dtype=dtype)

	        # bincount() needs a type that casts safely to intp everywhere
	        counts = numpy.bincount(ordinals.astype(numpy.intp))

	        # only visit the ordinals that actually occur
	        present = numpy.flatnonzero(counts)
	        return dict((unichr(i), n) for i, n in
	                    zip(present.tolist(), counts[present].tolist()))
	except ImportError:
	    warn("warning: can't import numpy, countchars_numpy() will be unavailable"
	        ", try\nsudo apt-get install python-numpy", ImportWarning)

	try:
	    from smark import countchars as _countchars_smark
	    def countchars_smark(chars):
	        """Count characters using the `smark` C extension.

	        Bytes input is decoded as UTF-8 first; the (char, count) pairs
	        returned by the extension are collected into a dict.
	        """
	        if isinstance(chars, bytes):
	            chars = chars.decode('utf-8') # try to decode
	        return dict(_countchars_smark(chars))
	except ImportError:
	    warn(
	        "warning: can't import smark, countchars_smark() will be unavailable",
	        ImportWarning)

	def getchars(filename):
	    """Return the whole content of `filename` decoded as UTF-8 text.

	    The file is closed before returning, even if reading fails.
	    """
	    with codecs.open(filename, encoding='utf-8') as f:
	        return f.read()

	def countchars_Counter(chars):
	    """Count characters by handing `chars` to `collections.Counter`."""
	    return collections.Counter(chars)

	def countchars_defaultdict(chars):
	    """Count characters with a `defaultdict(int)`; returns a plain dict."""
	    d = collections.defaultdict(int)
	    for ch in chars:
	        d[ch] += 1
	    return dict(d)

	def countchars_list(chars, ord=ord):
	    """Count characters using a list indexed by ordinal.

	    `ord` is bound as a default argument for a fast local lookup.  The
	    table starts with 256 slots and grows on demand, so characters
	    beyond Latin-1 are counted instead of raising IndexError.
	    Returns a dict mapping character -> count.
	    """
	    counts = [0] * 256
	    for ch in chars:
	        i = ord(ch)
	        try:
	            counts[i] += 1
	        except IndexError:
	            # first ordinal beyond the current table: extend up to it
	            counts.extend([0] * (i + 1 - len(counts)))
	            counts[i] += 1
	    return dict((unichr(i), n) for i, n in enumerate(counts) if n)

	def countchars_dict(chars):
	    """Count characters with a plain dict and `dict.setdefault`."""
	    d = dict()
	    for ch in chars:
	        d[ch] = d.setdefault(ch, 0) + 1
	    return d

	def countchars_dict_try_catch(chars):
	    """Count characters with a plain dict, handling new keys via KeyError."""
	    d = dict()
	    for ch in chars:
	        try:
	            d[ch] += 1
	        except KeyError:
	            # first occurrence of this character
	            d[ch] = 1
	    return d

	def printcounts(counts):
	    """Print (char, count) pairs as `(c n)` items wrapped at 79 columns.

	    `counts` is an iterable of (character, count) pairs.  Space, tab and
	    newline are shown quoted/escaped so they stay visible after wrapping.
	    """
	    def show(c):
	        # repr(str(c)) renders identically on Python 2 and 3; repr() of
	        # the encoded bytes would add a b prefix on Python 3.
	        return repr(str(c)) if c in ' \t\n' else c

	    s = ' '.join('(%s %d)' % (show(c), n) for c, n in counts)
	    # Wrap the text itself: textwrap cannot process bytes on Python 3.
	    text = textwrap.fill(s, width=79)
	    if not isinstance(text, str):
	        # Python 2: `text` is `unicode`; encode explicitly for output
	        text = text.encode('utf-8')
	    print(text)

	def eval_dottedname(dottedname):
	    """Resolve a dotted name such as "package.module.attr" to an object.

	    The first component is imported as a module and the remaining
	    components are looked up with getattr().  A component naming a
	    submodule that has not been imported yet is imported on demand.
	    Raises ImportError if the first component cannot be imported and
	    AttributeError if a later component cannot be resolved.

	    >>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
	    <function join at 0x...>
	    >>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
	    <built-in function exit>
	    >>> eval_dottedname("sys") #doctest: +ELLIPSIS
	    <module 'sys' (built-in)>
	    """
	    import importlib
	    import types

	    parts = dottedname.split(".")
	    obj = __import__(parts[0])
	    for depth, attr in enumerate(parts[1:], 2):
	        if isinstance(obj, types.ModuleType) and not hasattr(obj, attr):
	            # Importing a package does not import its submodules.
	            try:
	                importlib.import_module(".".join(parts[:depth]))
	            except ImportError:
	                pass  # not a module: getattr() raises AttributeError
	        obj = getattr(obj, attr)
	    return obj

	def sortcounts(counts):
	    """Return (char, count) pairs ordered by count, then char, descending.

	    `counts` is either a mapping (its items() are used) or an iterable
	    of (char, count) pairs.
	    """
	    if hasattr(counts, 'items'):
	        counts = counts.items()
	    return sorted(counts, key=lambda x: (x[1], x[0]), reverse=True)

	# canonical form used when comparing/printing counter results
	normalize = sortcounts

	def run(filename, countchars):
	    """Count the characters of `filename` with `countchars` and print them."""
	    printcounts(normalize(countchars(getchars(filename))))

	def run_all(filename):
	    """Run every available countchars_*() function and cross-check them.

	    Each function defined in this module whose name starts with
	    'countchars_' is called on the text of `filename`.  A function that
	    raises is reported on stderr with its traceback and skipped.  All
	    remaining results must agree; the common result is printed.

	    Raises AssertionError if two functions disagree and RuntimeError if
	    no function produced a result.
	    """
	    import traceback

	    # Use the running module rather than re-importing this file by name.
	    module = sys.modules[__name__]
	    funcs = [getattr(module, n) for n in dir(module)
	             if n.startswith('countchars_')]

	    chars = getchars(filename)
	    results = {}
	    for f in funcs:
	        try:
	            results[f.__name__] = normalize(f(chars))
	        except Exception:  # keep going; KeyboardInterrupt propagates
	            print('_'*79, file=sys.stderr)
	            print('%s():' % f.__name__, file=sys.stderr)
	            traceback.print_exc(file=sys.stderr)
	            print('-'*79, file=sys.stderr)

	    if not results:
	        raise RuntimeError(
	            "no countchars_*() function produced a result for %r" % (filename,))

	    # Equality is transitive: compare everything against one reference.
	    # Raise explicitly because a bare `assert` disappears under -O.
	    names = sorted(results)
	    ref_name, reference = names[0], results[names[0]]
	    for name in names[1:]:
	        if results[name] != reference:
	            raise AssertionError(
	                ((ref_name, name), (reference, results[name])))
	    printcounts(reference)

	def main(argv):
	    """Command-line entry point.

	    `argv` is the full argument vector including the program name; the
	    first positional argument after it is the file to analyse (default:
	    /usr/share/dict/american-english).  Returns whatever the selected
	    runner returns, for use as the process exit status.
	    """
	    parser = OptionParser()
	    parser.add_option('-p', '--profile', action="store_true", default=False,
	                      help="enable profiler")
	    parser.add_option('-f', '--function', default="count_char.countchars_defaultdict",
	                      help="specify which countchars function to use or 'all'")
	    options, args = parser.parse_args(argv)

	    profiler = None
	    if options.profile:
	        # `profile` only exists if the optional profilestats import
	        # succeeded; report that cleanly instead of a later NameError.
	        try:
	            profiler = profile
	        except NameError:
	            parser.error("profiling is unavailable, try: pip install profilestats")

	    # args[0] is the program name because the whole argv was parsed
	    filename = args[1] if len(args) > 1 else '/usr/share/dict/american-english'
	    if options.function == 'all':
	        f, fargs = run_all, [filename]
	    else:
	        f, fargs = run, [filename, eval_dottedname(options.function)]

	    if profiler is not None:
	        f = profiler(f)

	    return f(*fargs)


	# Script entry point: run main() and use its return value as exit status.
	if __name__=="__main__":
	    sys.exit(main(sys.argv))
	#cython: language_level=2, boundscheck=False, wraparound=False
	import cython

	try:
	from __builtin__ import unichr
	except ImportError:
	unichr = chr # py3k

	# `chars` is typed as bytes, `c` as a C integer and `L` as a C array with
	# one counter per possible byte value (0x100 slots).
	@cython.locals(chars=bytes, c=cython.Py_ssize_t, L=cython.Py_ssize_t[0x100])
	def countchars_cython_bytes(chars):
	    """Count the byte values in `chars`.

	    Returns a dict mapping unichr(byte value) -> number of occurrences,
	    containing only the values that occur at least once.
	    """
	    # zero the counters before use
	    for c in range(0x100):
	        L[c] = 0

	    for c in chars: # translates into an efficient loop in Cython 0.13+
	        L[c] += 1

	    return {unichr(c): L[c] for c in range(0x100) if L[c]}

	# `chars` is typed as unicode, `i` as a C integer and `L` as a C array of
	# 0x10000 counters indexed by character ordinal.
	@cython.locals(chars=unicode, i=cython.Py_ssize_t, L=cython.Py_ssize_t[0x10000])
	def countchars_cython(chars):
	    """Count the characters of the unicode string `chars`.

	    Returns a dict mapping unichr(ordinal) -> number of occurrences,
	    containing only the ordinals that occur at least once.
	    """
	    # zero the counters before use
	    for i in range(0x10000):
	        L[i] = 0

	    # NOTE(review): L has 0x10000 slots; presumably a character with an
	    # ordinal above 0xFFFF indexes past the end of L -- confirm.
	    for c in chars:
	        L[c] += 1

	    return {unichr(i): L[i] for i in range(0x10000) if L[i]}
	from distutils.core import setup, Extension

	# Build description for the `smark` extension module, compiled from the
	# single C source file smarkmodule.c.
	setup (name = 'smark',
	version = '0.2',
	description = 'count characters in C',
	ext_modules = [Extension('smark', sources = ['smarkmodule.c'])])
	/**
	See http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character/2532564#2532564
	*/
	#include <Python.h>


	/*
	 * countchars(text) -> list of (character, count) tuples.
	 *
	 * Counts how often each character occurs in the unicode string `text`.
	 * Tuples appear in order of first occurrence.  Only ordinals up to
	 * 0xFFFF are supported; a larger ordinal raises ValueError instead of
	 * indexing past the end of the counter table.  Returns NULL with an
	 * exception set on failure.
	 */
	static PyObject *
	CharCounter(PyObject *self, PyObject *args)
	{
	    /* Scratch tables, allocated on first use and reused by later calls. */
	    static unsigned *char_counter = NULL;  /* occurrences, by ordinal */
	    static Py_UNICODE *char_list = NULL;   /* distinct chars, first-seen order */

	    Py_UNICODE *text;
	    int length = 0;  /* "u#" stores an int: PY_SSIZE_T_CLEAN is not defined */
	    Py_ssize_t i, nchars = 0;
	    PyObject *result, *item;

	    /* "u#" yields a Py_UNICODE buffer, whose width may differ from wchar_t */
	    if (!PyArg_ParseTuple(args, "u#", &text, &length)) return NULL;

	    if (!char_counter &&
	        !(char_counter = (unsigned *)malloc(sizeof(unsigned) * 0x10000)))
	        return PyErr_NoMemory();
	    if (!char_list &&
	        !(char_list = (Py_UNICODE *)malloc(sizeof(Py_UNICODE) * 0x10000)))
	        return PyErr_NoMemory();

	    for (i = 0; i <= 0xFFFF; i++) char_counter[i] = 0;

	    for (i = 0; i < length; i++) {
	        Py_UNICODE ch = text[i];
	        if ((unsigned long)ch > 0xFFFFul) {
	            /* would index outside the 0x10000-entry tables */
	            PyErr_SetString(PyExc_ValueError,
	                "countchars: unicode characters with ordinals larger than "
	                "0xFFFF are unsupported");
	            return NULL;
	        }
	        if (char_counter[ch]++ == 0) char_list[nchars++] = ch;
	    }

	    result = PyList_New(nchars);
	    if (!result) return NULL;

	    for (i = 0; i < nchars; i++) {
	        /* 1-character unicode string plus its count as a Python int */
	        item = Py_BuildValue("(u#l)", &char_list[i], 1,
	                             (long)char_counter[char_list[i]]);
	        if (!item) {
	            Py_DECREF(result);  /* do not leak the partially filled list */
	            return NULL;
	        }
	        PyList_SET_ITEM(result, i, item);  /* steals the reference */
	    }

	    return result;
	}

	/* Method table of the module: exposes CharCounter() to Python under the
	   name `countchars`, taking positional arguments (METH_VARARGS). */
	static PyMethodDef SmarkMethods[] = {
	{"countchars", (PyCFunction)CharCounter, METH_VARARGS,
	"Count character frequencies in a given unicode text.\n"
	"Note: unicode characters with ordinals larger than 0xFFFF are unsupported"},
	{NULL, NULL, 0, NULL} /* Sentinel */
	};


	/* Module initialisation function: creates the `smark` module with the
	   methods listed in SmarkMethods. */
	PyMODINIT_FUNC
	initsmark(void)
	{
	(void) Py_InitModule("smark", SmarkMethods);
	}