zed/make-figures.py

## make-figures.py
#!/usr/bin/env python2
"""Generate performance graphs. See

http://stackoverflow.com/questions/464960/code-golf-combining-multiple-sorted-lists-into-a-single-sorted-list#464967
"""
from __future__ import with_statement
from optparse import OptionParser, make_option
import collections
import copy
import glob
import itertools
import math
import operator
import os
import pickle
import re
import sys
import tempfile
import timeit
import warnings

# numpy, matplotlib used only for plotting
try:
    import numpy
    import pylab
except ImportError:
    warnings.warn("can't import numpy/pylab")

try:
    from merge_funcs import (merge_26, sort_builtin, merge_alabaster,
                             sort_deestan)
except ImportError:
    pass

__version__ = "1.0.0"


def make_sequence_creator(func, M):
    def f(n): return [func(n) for m in xrange(M)]
    f.__name__ = func.__name__
    return f


def ones(n):
    return [1] * n


def sorted_random(max_):
    def f(n):
        L = random.sample(xrange(max_), n)
        # XXX L.sort()
        return L
    f.__name__ = "sorted_random_%d" % max_
    return f


def eval_dottedname(dottedname):
    """
    >>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
    <function join at 0x...>
    >>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
    <built-in function exit>
    >>> eval_dottedname("sys") #doctest: +ELLIPSIS
    <module 'sys' (built-in)>
    """
    return reduce(getattr, dottedname.split(".")[1:], __import__(dottedname.partition(".")[0]))


def usec(function, arguments, readonly_args=False):
    """
    Given `function` and `arguments` return a time (in microseconds) it takes to make the call.

    Note: There is an overhead in executing a function that does nothing.

    http://www.rosettacode.org/wiki/Query_Performance#Python

    >>> 5 > usec(nothing, [], True) > 0
    True
    >>> 5 > usec(identity, [1], True) > 0
    True
    """
    funcname = function.__name__
    fd, pickle_fn = tempfile.mkstemp(
        suffix='.pickle', prefix='usec', text=True)
    pickle.dump((function, arguments), os.fdopen(fd, 'w'))
    del function
    del arguments  # free memory
    timer = timeit.Timer(stmt='%(funcname)s(*args)' % vars(),
                         setup='import pickle; %(funcname)s, args = pickle.load(open(%(pickle_fn)r))' % vars())

    try:
        try:
            seconds, N = 0, 1
            while seconds < 0.2:
                seconds = min(timer.repeat(repeat=3, number=N))
                N *= 10
                if not readonly_args:
                    break
            microseconds = round(1000000 * seconds / (N / 10), 1)  # per loop
            return microseconds
        except RuntimeError, err:
            # XXX for `merge_alabaster`
            if "maximum recursion depth exceeded" in err.message:
                print >> sys.stderr, err.message
                return 0
            else:
                raise
    except:
        timer.print_exc(file=sys.stderr)
        raise


def nothing(*args): pass


def identity(x): return x


def writedat(filename, x, y, xprecision=3, yprecision=5):
    r"""
    Write two equal-sized numerical arrays `x' and `y' to a two-column text file named `filename'.

    The first column of the file contains values from an `x'-array
    with a given `xprecision', the second -- values from `y'-array
    with `yprecision'.

    http://www.rosettacode.org/wiki/Write_float_arrays_to_a_text_file#Python

    Example usage

    >>> import math
    >>> x = [1, 2, 3, 1e11]
    >>> y = map(math.sqrt, x)
    >>> y
    [1.0, 1.4142135623730951, 1.7320508075688772, 316227.76601683791]
    >>> writedat('sqrt.dat', x, y)
    written 'sqrt.dat'
    >>> # check
    ...
    >>> open('sqrt.dat').read()
    '1\t1\n2\t1.4142\n3\t1.7321\n1e+11\t3.1623e+05\n'
    >>> import os
    >>> os.remove('sqrt.dat')
    """
    with open(filename, 'w') as f:
        for a, b in itertools.izip(x, y):
            print >> f, "%.*g\t%.*g" % (xprecision, a, yprecision, b)
    print "written '%s'" % filename


def write_timings(npoints, maxN, sort_functions, sequence_creators, prefix=''):
    """
    http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Write_timings
    """
    maxexp = math.log(maxN, 2)
    step = (maxexp - 1) / npoints
    Ns = [2**e for e in range(1, int(maxexp + 1.5), int(step + .5) or 1)]
    for sort in sort_functions:
        for make_seq in sequence_creators:
            L = make_seq(10)
            Lcopy = copy.deepcopy(L)
            assert L == Lcopy
            _ = sort(L)
            _ = sort(L)
            Ts = map(lambda n: usec(sort, (make_seq(n),),
                                    readonly_args=(L == Lcopy)), Ns)
            writedat('%s%s-%s-%d-%d.xy' % (prefix, sort.__name__,  make_seq.__name__,
                                           len(Ns), max(Ns)), Ns, Ts)


def plotdd(dictplotdict, prefix=''):
    """See ``plot_timings()`` below."""
    symbols = list('o^v<>s+xDd1234hHp|_')
    colors = list('bgrcmyk')  # split string on distinct characters
    for npoints, plotdict in dictplotdict.iteritems():
        for ttle, lst in plotdict.iteritems():
            pylab.hold(False)
            for i, (label, polynom, x, y) in enumerate(sorted(lst, key=operator.itemgetter(0))):
                # dots
                pylab.plot(x, y, colors[i % len(colors)] + symbols[i % len(symbols)],
                           # legend
                           label='%.2g %s + %.2g %s' % (polynom[1], polynom.variable,
                                                        polynom[0], label))
                pylab.hold(True)  # plot on the same graph
                y = numpy.polyval(polynom, x)
                print polynom, label
                # lines
                pylab.plot(x, y, colors[i % len(colors)], label='_nolegend_')
            pylab.legend(loc='upper left')
            pylab.xlabel(polynom.variable)
            pylab.ylabel('log2( time in microseconds )')
            pylab.title(ttle, verticalalignment='bottom')
            figname = '%(prefix)s_%(npoints)03d%(ttle)s' % vars()
            pylab.savefig(figname + '.png')
            pylab.savefig(figname + '.pdf')
            print figname


def plot_timings(prefix=''):
    """
    http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Plot_timings
    """
    def makedict(): return collections.defaultdict(
        lambda: collections.defaultdict(list))
    df = makedict()
    ds = makedict()
    # populate plot dictionaries
    for filename in glob.glob('%s*.xy' % prefix):
        m = re.match(r'%s([^-]+)-([^-]+)-(\d+)-(\d+)\.xy' % prefix, filename)
        print filename
        assert m, filename
        funcname, seqname, npoints, maxN = m.groups()
        npoints, maxN = int(npoints), int(maxN)
        Ns, Ts = numpy.loadtxt(filename, dtype='f', unpack=True)
        # Ns -- sequences lengths
        # Ts -- corresponding times
        assert len(Ns) == len(Ts) == npoints
        # find approximating polynom in a form log(y) = a*log(x) + b
        logsafe = numpy.logical_and(Ns > 0, Ts > 0)
        Ts = numpy.log2(Ts[logsafe])
        Ns = numpy.log2(Ns[logsafe])
        coeffs = numpy.polyfit(Ns, Ts, deg=1)
        poly = numpy.poly1d(coeffs, variable='log2(N)')
        #
        df[npoints][funcname].append((seqname, poly, Ns, Ts))
        ds[npoints][seqname].append((funcname, poly, Ns, Ts))
    # actual plotting
    plotdd(df, prefix)
    plotdd(ds, prefix)  # see ``plotdd()`` above


def dottednames2funcs(dottednames):
    return [eval_dottedname(f if "." in f else "%s.%s" % (__name__, f))
            for f in dottednames]


def main(args=None):
    """ Based on:
    http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Figures:_log2.28_time_in_microseconds_.29_vs._log2.28_sequence_length_.29
    """
    # process args
    if args is None:
        args = sys.argv[1:]

    default_sort_functions = "merge_26 sort_builtin merge_funcs.merge_alabaster".split()

    p = OptionParser(description=__doc__, option_list=[
        make_option("--plot-only", action="store_true",
                    help="don't recompute *.xy files, just plot them"),
        make_option("-p", "--prefix",
                    help="a prefix for *.xy files (default: '%default')"),
        make_option("--npoints", type="int",
                    help="desirable number of sample point on the graph"),
        make_option("--nsublists", type="int",
                    help="number of sublists (default: %default)"),
        make_option("--maxn", type="int",
                    help="maximum size of sublists"),
        make_option("-s", "--sort-function", action="append",
                    help=("a sort function to plot, to add more functions to comparison"
                          " specify the option multiple times"
                          " (default: %s)" % default_sort_functions)),
        make_option("--sequence-creator", action="append",
                    help=("a sequence creator function accepts N return a sequence"
                          "(default: special case to test sorting multiple sorted lists)")),
    ])
    p.set_defaults(plot_only=False, prefix='',
                   npoints=100, nsublists=10, maxn=2**10)
    options, args = p.parse_args(args)
    if args:
        p.error("script does not accept arguments, but given %r" % args)

    if not options.plot_only:
        # measure performance and write acquired data to *.xy files
        M = options.nsublists  # number of sublists

        if not options.sequence_creator:
            sequence_creators = [make_sequence_creator(f, M)
                                 for f in (sorted_random(2**31 - 1), ones, range)]
        else:
            sequence_creators = dottednames2funcs(options.sequence_creator)

        if not options.sort_function:
            options.sort_function = default_sort_functions
        sort_functions = dottednames2funcs(options.sort_function)

        write_timings(npoints=options.npoints, maxN=options.maxn,
                      sort_functions=sort_functions,
                      sequence_creators=sequence_creators,
                      prefix=options.prefix)

    # plot *.xy files
    plot_timings(options.prefix)


if __name__ == '__main__':
    if '--test' in sys.argv:
        import doctest
        doctest.testmod()
        sys.argv.remove('--test')
    sys.exit(main(sys.argv[1:]))
	#!/usr/bin/env python2
	"""Generate performance graphs. See

	http://stackoverflow.com/questions/464960/code-golf-combining-multiple-sorted-lists-into-a-single-sorted-list#464967
	"""
	from __future__ import with_statement
	from optparse import OptionParser, make_option
	import collections
	import copy
	import glob
	import itertools
	import math
	import operator
	import os
	import pickle
	import re
	import sys
	import tempfile
	import timeit
	import warnings

	# numpy, matplotlib used only for plotting
	try:
	import numpy
	import pylab
	except ImportError:
	warnings.warn("can't import numpy/pylab")

	try:
	from merge_funcs import (merge_26, sort_builtin, merge_alabaster,
	sort_deestan)
	except ImportError:
	pass

	__version__ = "1.0.0"


	def make_sequence_creator(func, M):
	def f(n): return [func(n) for m in xrange(M)]
	f.__name__ = func.__name__
	return f


	def ones(n):
	return [1] * n


	def sorted_random(max_):
	def f(n):
	L = random.sample(xrange(max_), n)
	# XXX L.sort()
	return L
	f.__name__ = "sorted_random_%d" % max_
	return f


	def eval_dottedname(dottedname):
	"""
	>>> eval_dottedname("os.path.join") #doctest: +ELLIPSIS
	<function join at 0x...>
	>>> eval_dottedname("sys.exit") #doctest: +ELLIPSIS
	<built-in function exit>
	>>> eval_dottedname("sys") #doctest: +ELLIPSIS
	<module 'sys' (built-in)>
	"""
	return reduce(getattr, dottedname.split(".")[1:], __import__(dottedname.partition(".")[0]))


	def usec(function, arguments, readonly_args=False):
	"""
	Given `function` and `arguments` return a time (in microseconds) it takes to make the call.

	Note: There is an overhead in executing a function that does nothing.

	http://www.rosettacode.org/wiki/Query_Performance#Python

	>>> 5 > usec(nothing, [], True) > 0
	True
	>>> 5 > usec(identity, [1], True) > 0
	True
	"""
	funcname = function.__name__
	fd, pickle_fn = tempfile.mkstemp(
	suffix='.pickle', prefix='usec', text=True)
	pickle.dump((function, arguments), os.fdopen(fd, 'w'))
	del function
	del arguments # free memory
	timer = timeit.Timer(stmt='%(funcname)s(*args)' % vars(),
	setup='import pickle; %(funcname)s, args = pickle.load(open(%(pickle_fn)r))' % vars())

	try:
	try:
	seconds, N = 0, 1
	while seconds < 0.2:
	seconds = min(timer.repeat(repeat=3, number=N))
	N *= 10
	if not readonly_args:
	break
	microseconds = round(1000000 * seconds / (N / 10), 1) # per loop
	return microseconds
	except RuntimeError, err:
	# XXX for `merge_alabaster`
	if "maximum recursion depth exceeded" in err.message:
	print >> sys.stderr, err.message
	return 0
	else:
	raise
	except:
	timer.print_exc(file=sys.stderr)
	raise


	def nothing(*args): pass


	def identity(x): return x


	def writedat(filename, x, y, xprecision=3, yprecision=5):
	r"""
	Write two equal-sized numerical arrays `x' and `y' to a two-column text file named `filename'.

	The first column of the file contains values from an `x'-array
	with a given `xprecision', the second -- values from `y'-array
	with `yprecision'.

	http://www.rosettacode.org/wiki/Write_float_arrays_to_a_text_file#Python

	Example usage

	>>> import math
	>>> x = [1, 2, 3, 1e11]
	>>> y = map(math.sqrt, x)
	>>> y
	[1.0, 1.4142135623730951, 1.7320508075688772, 316227.76601683791]
	>>> writedat('sqrt.dat', x, y)
	written 'sqrt.dat'
	>>> # check
	...
	>>> open('sqrt.dat').read()
	'1\t1\n2\t1.4142\n3\t1.7321\n1e+11\t3.1623e+05\n'
	>>> import os
	>>> os.remove('sqrt.dat')
	"""
	with open(filename, 'w') as f:
	for a, b in itertools.izip(x, y):
	print >> f, "%.g\t%.g" % (xprecision, a, yprecision, b)
	print "written '%s'" % filename


	def write_timings(npoints, maxN, sort_functions, sequence_creators, prefix=''):
	"""
	http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Write_timings
	"""
	maxexp = math.log(maxN, 2)
	step = (maxexp - 1) / npoints
	Ns = [2**e for e in range(1, int(maxexp + 1.5), int(step + .5) or 1)]
	for sort in sort_functions:
	for make_seq in sequence_creators:
	L = make_seq(10)
	Lcopy = copy.deepcopy(L)
	assert L == Lcopy
	_ = sort(L)
	_ = sort(L)
	Ts = map(lambda n: usec(sort, (make_seq(n),),
	readonly_args=(L == Lcopy)), Ns)
	writedat('%s%s-%s-%d-%d.xy' % (prefix, sort.__name__, make_seq.__name__,
	len(Ns), max(Ns)), Ns, Ts)


	def plotdd(dictplotdict, prefix=''):
	"""See ``plot_timings()`` below."""
	symbols = list('o^v<>s+xDd1234hHp\|_')
	colors = list('bgrcmyk') # split string on distinct characters
	for npoints, plotdict in dictplotdict.iteritems():
	for ttle, lst in plotdict.iteritems():
	pylab.hold(False)
	for i, (label, polynom, x, y) in enumerate(sorted(lst, key=operator.itemgetter(0))):
	# dots
	pylab.plot(x, y, colors[i % len(colors)] + symbols[i % len(symbols)],
	# legend
	label='%.2g %s + %.2g %s' % (polynom[1], polynom.variable,
	polynom[0], label))
	pylab.hold(True) # plot on the same graph
	y = numpy.polyval(polynom, x)
	print polynom, label
	# lines
	pylab.plot(x, y, colors[i % len(colors)], label='_nolegend_')
	pylab.legend(loc='upper left')
	pylab.xlabel(polynom.variable)
	pylab.ylabel('log2( time in microseconds )')
	pylab.title(ttle, verticalalignment='bottom')
	figname = '%(prefix)s_%(npoints)03d%(ttle)s' % vars()
	pylab.savefig(figname + '.png')
	pylab.savefig(figname + '.pdf')
	print figname


	def plot_timings(prefix=''):
	"""
	http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Plot_timings
	"""
	def makedict(): return collections.defaultdict(
	lambda: collections.defaultdict(list))
	df = makedict()
	ds = makedict()
	# populate plot dictionaries
	for filename in glob.glob('%s*.xy' % prefix):
	m = re.match(r'%s([^-]+)-([^-]+)-(\d+)-(\d+)\.xy' % prefix, filename)
	print filename
	assert m, filename
	funcname, seqname, npoints, maxN = m.groups()
	npoints, maxN = int(npoints), int(maxN)
	Ns, Ts = numpy.loadtxt(filename, dtype='f', unpack=True)
	# Ns -- sequences lengths
	# Ts -- corresponding times
	assert len(Ns) == len(Ts) == npoints
	# find approximating polynom in a form log(y) = a*log(x) + b
	logsafe = numpy.logical_and(Ns > 0, Ts > 0)
	Ts = numpy.log2(Ts[logsafe])
	Ns = numpy.log2(Ns[logsafe])
	coeffs = numpy.polyfit(Ns, Ts, deg=1)
	poly = numpy.poly1d(coeffs, variable='log2(N)')
	#
	df[npoints][funcname].append((seqname, poly, Ns, Ts))
	ds[npoints][seqname].append((funcname, poly, Ns, Ts))
	# actual plotting
	plotdd(df, prefix)
	plotdd(ds, prefix) # see ``plotdd()`` above


	def dottednames2funcs(dottednames):
	return [eval_dottedname(f if "." in f else "%s.%s" % (__name__, f))
	for f in dottednames]


	def main(args=None):
	""" Based on:
	http://www.rosettacode.org/wiki/Measure_relative_performance_of_sorting_algorithms_implementations#Figures:_log2.28_time_in_microseconds_.29_vs._log2.28_sequence_length_.29
	"""
	# process args
	if args is None:
	args = sys.argv[1:]

	default_sort_functions = "merge_26 sort_builtin merge_funcs.merge_alabaster".split()

	p = OptionParser(description=__doc__, option_list=[
	make_option("--plot-only", action="store_true",
	help="don't recompute *.xy files, just plot them"),
	make_option("-p", "--prefix",
	help="a prefix for *.xy files (default: '%default')"),
	make_option("--npoints", type="int",
	help="desirable number of sample point on the graph"),
	make_option("--nsublists", type="int",
	help="number of sublists (default: %default)"),
	make_option("--maxn", type="int",
	help="maximum size of sublists"),
	make_option("-s", "--sort-function", action="append",
	help=("a sort function to plot, to add more functions to comparison"
	" specify the option multiple times"
	" (default: %s)" % default_sort_functions)),
	make_option("--sequence-creator", action="append",
	help=("a sequence creator function accepts N return a sequence"
	"(default: special case to test sorting multiple sorted lists)")),
	])
	p.set_defaults(plot_only=False, prefix='',
	npoints=100, nsublists=10, maxn=2**10)
	options, args = p.parse_args(args)
	if args:
	p.error("script does not accept arguments, but given %r" % args)

	if not options.plot_only:
	# measure performance and write acquired data to *.xy files
	M = options.nsublists # number of sublists

	if not options.sequence_creator:
	sequence_creators = [make_sequence_creator(f, M)
	for f in (sorted_random(2**31 - 1), ones, range)]
	else:
	sequence_creators = dottednames2funcs(options.sequence_creator)

	if not options.sort_function:
	options.sort_function = default_sort_functions
	sort_functions = dottednames2funcs(options.sort_function)

	write_timings(npoints=options.npoints, maxN=options.maxn,
	sort_functions=sort_functions,
	sequence_creators=sequence_creators,
	prefix=options.prefix)

	# plot *.xy files
	plot_timings(options.prefix)


	if __name__ == '__main__':
	if '--test' in sys.argv:
	import doctest
	doctest.testmod()
	sys.argv.remove('--test')
	sys.exit(main(sys.argv[1:]))