Skip to content

Instantly share code, notes, and snippets.

@nowox
Created December 6, 2016 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nowox/76b0d405778387ebccc241be118dbed7 to your computer and use it in GitHub Desktop.
Save nowox/76b0d405778387ebccc241be118dbed7 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import cProfile, pstats, StringIO
from random import choice, random
import copy
import collections
class DataGenerator(object):
depth = 0
def __init__(self, max_depth = 3):
self.max_depth = max_depth
def gen_scalar(self, n):
return ''.join(choice("abcdefghijklmnopqrstuvwxyz1234567890") for i in range(100))
def gen_dict(self, n, duplicates=0.5):
if self.depth > self.max_depth:
return self.gen_scalar(n)
self.depth += 1
d = {}
v = self.gen_any(n)
for i in range(n):
if random() > duplicates:
v = self.gen_any(n)
d[self.gen_scalar(n)] = v
self.depth -= 1
return d
def gen_list(self, n):
if self.depth > self.max_depth:
return self.gen_scalar(n)
self.depth += 1
d = [self.gen_any(n) for i in range(n)]
self.depth -= 1
return d
def gen_any(self, n):
if self.depth > self.max_depth:
return self.gen_scalar(n)
return choice([self.gen_scalar, self.gen_dict, self.gen_list])(n)
def faithfulrepr(ds):
"""Returns a plain-text representation of a mixed seq/mapping/scalar
type data structure.
The datastructure is recursively ordered (ordereddict) then a
dataset representation is returned.
Args:
ds Dataset (mixed seq/mapping/scalar data structure)
Returns:
Sorted plain-text representation of the input dataset.
"""
ds = copy.deepcopy(ds)
if isinstance(ds, collections.Mapping):
res = collections.OrderedDict()
for k, v in sorted(ds.items()):
res[k] = faithfulrepr(v)
return repr(res)
if isinstance(ds, list):
for i, v in enumerate(ds):
ds[i] = faithfulrepr(v)
return repr(ds)
return repr(ds)
def tupelize_dict(ds):
"""Group identical values of a dictionary under the same key. The keys
become a tuple of all the duplicate keys.
Args:
ds: Input dictionary
Example::
ds = {23: 'foo',
25: 'bar',
28: 'foo',
30: 'bar',
33: 'foo'}
>>>tupelize_dict(ds)
{
(23,28,33): 'foo',
(25,30): 'bar'
}
"""
taxonomy = {}
binder = collections.defaultdict(list)
for key, value in ds.items():
signature = faithfulrepr(value)
taxonomy[signature] = value
binder[signature].append(key)
return {tuple(keys): taxonomy[s] for s, keys in binder.items()}
def tupelize_dict2(ds):
keys, values = [], []
for key, value in ds.items():
try:
index = values.index(value)
except ValueError:
values.append(value)
keys.append([key])
else:
keys[index].append(key)
return dict(zip(map(tuple, keys), values))
g = DataGenerator()
ds = g.gen_dict(30,0.5)
ds2 = copy.deepcopy(ds)
pr = cProfile.Profile()
pr.enable()
os1 = tupelize_dict(ds)
pr.disable()
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print s.getvalue()
pr = cProfile.Profile()
pr.enable()
os2 = tupelize_dict2(ds2)
pr.disable()
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print s.getvalue()
print os1 == os2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment