nowox/tupelize-benchmark

## tupelize-benchmark
#!/usr/bin/python
import cProfile, pstats, StringIO
from random import choice, random

import copy
import collections

class DataGenerator(object):

    depth = 0

    def __init__(self, max_depth = 3):
        self.max_depth = max_depth

    def gen_scalar(self, n):
        return ''.join(choice("abcdefghijklmnopqrstuvwxyz1234567890") for i in range(100))

    def gen_dict(self, n, duplicates=0.5):
        if self.depth > self.max_depth:
            return self.gen_scalar(n)
        self.depth += 1
        d = {}
        v = self.gen_any(n)
        for i in range(n):
            if random() > duplicates:
                v = self.gen_any(n)
            d[self.gen_scalar(n)] = v
        self.depth -= 1
        return d

    def gen_list(self, n):
        if self.depth > self.max_depth:
            return self.gen_scalar(n)
        self.depth += 1
        d = [self.gen_any(n) for i in range(n)]
        self.depth -= 1
        return d

    def gen_any(self, n):
        if self.depth > self.max_depth:
            return self.gen_scalar(n)
        return choice([self.gen_scalar, self.gen_dict, self.gen_list])(n)


def faithfulrepr(ds):
    """Returns a plain-text representation of a mixed seq/mapping/scalar
    type data structure.

    The datastructure is recursively ordered (ordereddict) then a
    dataset representation is returned.

    Args:

        ds  Dataset (mixed seq/mapping/scalar data structure)

    Returns:

        Sorted plain-text representation of the input dataset.
    """
    ds = copy.deepcopy(ds)
    if isinstance(ds, collections.Mapping):
        res = collections.OrderedDict()
        for k, v in sorted(ds.items()):
            res[k] = faithfulrepr(v)
        return repr(res)
    if isinstance(ds, list):
        for i, v in enumerate(ds):
            ds[i] = faithfulrepr(v)
        return repr(ds)
    return repr(ds)


def tupelize_dict(ds):
    """Group identical values of a dictionary under the same key. The keys
    become a tuple of all the duplicate keys.

    Args:

        ds: Input dictionary

    Example::

        ds = {23: 'foo',
              25: 'bar',
              28: 'foo',
              30: 'bar',
              33: 'foo'}

        >>>tupelize_dict(ds)
        {
            (23,28,33): 'foo',
            (25,30): 'bar'
        }
    """
    taxonomy = {}
    binder = collections.defaultdict(list)
    for key, value in ds.items():
        signature = faithfulrepr(value)
        taxonomy[signature] = value
        binder[signature].append(key)
    return {tuple(keys): taxonomy[s] for s, keys in binder.items()}

def tupelize_dict2(ds):
    keys, values = [], []
    for key, value in ds.items():
        try:
            index = values.index(value)
        except ValueError:
            values.append(value)
            keys.append([key])
        else:
            keys[index].append(key)
    return dict(zip(map(tuple, keys), values))

g = DataGenerator()
ds = g.gen_dict(30,0.5)
ds2 = copy.deepcopy(ds)

pr = cProfile.Profile()
pr.enable()
os1 = tupelize_dict(ds)
pr.disable()
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print s.getvalue()

pr = cProfile.Profile()
pr.enable()
os2 = tupelize_dict2(ds2)
pr.disable()
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print s.getvalue()

print os1 == os2
	#!/usr/bin/python
	import cProfile, pstats, StringIO
	from random import choice, random

	import copy
	import collections

	class DataGenerator(object):

	depth = 0

	def __init__(self, max_depth = 3):
	self.max_depth = max_depth

	def gen_scalar(self, n):
	return ''.join(choice("abcdefghijklmnopqrstuvwxyz1234567890") for i in range(100))

	def gen_dict(self, n, duplicates=0.5):
	if self.depth > self.max_depth:
	return self.gen_scalar(n)
	self.depth += 1
	d = {}
	v = self.gen_any(n)
	for i in range(n):
	if random() > duplicates:
	v = self.gen_any(n)
	d[self.gen_scalar(n)] = v
	self.depth -= 1
	return d

	def gen_list(self, n):
	if self.depth > self.max_depth:
	return self.gen_scalar(n)
	self.depth += 1
	d = [self.gen_any(n) for i in range(n)]
	self.depth -= 1
	return d

	def gen_any(self, n):
	if self.depth > self.max_depth:
	return self.gen_scalar(n)
	return choice([self.gen_scalar, self.gen_dict, self.gen_list])(n)


	def faithfulrepr(ds):
	"""Returns a plain-text representation of a mixed seq/mapping/scalar
	type data structure.

	The datastructure is recursively ordered (ordereddict) then a
	dataset representation is returned.

	Args:

	ds Dataset (mixed seq/mapping/scalar data structure)

	Returns:

	Sorted plain-text representation of the input dataset.
	"""
	ds = copy.deepcopy(ds)
	if isinstance(ds, collections.Mapping):
	res = collections.OrderedDict()
	for k, v in sorted(ds.items()):
	res[k] = faithfulrepr(v)
	return repr(res)
	if isinstance(ds, list):
	for i, v in enumerate(ds):
	ds[i] = faithfulrepr(v)
	return repr(ds)
	return repr(ds)


	def tupelize_dict(ds):
	"""Group identical values of a dictionary under the same key. The keys
	become a tuple of all the duplicate keys.

	Args:

	ds: Input dictionary

	Example::

	ds = {23: 'foo',
	25: 'bar',
	28: 'foo',
	30: 'bar',
	33: 'foo'}

	>>>tupelize_dict(ds)
	{
	(23,28,33): 'foo',
	(25,30): 'bar'
	}
	"""
	taxonomy = {}
	binder = collections.defaultdict(list)
	for key, value in ds.items():
	signature = faithfulrepr(value)
	taxonomy[signature] = value
	binder[signature].append(key)
	return {tuple(keys): taxonomy[s] for s, keys in binder.items()}

	def tupelize_dict2(ds):
	keys, values = [], []
	for key, value in ds.items():
	try:
	index = values.index(value)
	except ValueError:
	values.append(value)
	keys.append([key])
	else:
	keys[index].append(key)
	return dict(zip(map(tuple, keys), values))

	g = DataGenerator()
	ds = g.gen_dict(30,0.5)
	ds2 = copy.deepcopy(ds)

	pr = cProfile.Profile()
	pr.enable()
	os1 = tupelize_dict(ds)
	pr.disable()
	s = StringIO.StringIO()
	sortby = 'cumulative'
	ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
	ps.print_stats()
	print s.getvalue()

	pr = cProfile.Profile()
	pr.enable()
	os2 = tupelize_dict2(ds2)
	pr.disable()
	s = StringIO.StringIO()
	sortby = 'cumulative'
	ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
	ps.print_stats()
	print s.getvalue()

	print os1 == os2