Created
December 6, 2016 16:30
-
-
Save nowox/76b0d405778387ebccc241be118dbed7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import cProfile, pstats, StringIO | |
from random import choice, random | |
import copy | |
import collections | |
class DataGenerator(object): | |
depth = 0 | |
def __init__(self, max_depth = 3): | |
self.max_depth = max_depth | |
def gen_scalar(self, n): | |
return ''.join(choice("abcdefghijklmnopqrstuvwxyz1234567890") for i in range(100)) | |
def gen_dict(self, n, duplicates=0.5): | |
if self.depth > self.max_depth: | |
return self.gen_scalar(n) | |
self.depth += 1 | |
d = {} | |
v = self.gen_any(n) | |
for i in range(n): | |
if random() > duplicates: | |
v = self.gen_any(n) | |
d[self.gen_scalar(n)] = v | |
self.depth -= 1 | |
return d | |
def gen_list(self, n): | |
if self.depth > self.max_depth: | |
return self.gen_scalar(n) | |
self.depth += 1 | |
d = [self.gen_any(n) for i in range(n)] | |
self.depth -= 1 | |
return d | |
def gen_any(self, n): | |
if self.depth > self.max_depth: | |
return self.gen_scalar(n) | |
return choice([self.gen_scalar, self.gen_dict, self.gen_list])(n) | |
def faithfulrepr(ds): | |
"""Returns a plain-text representation of a mixed seq/mapping/scalar | |
type data structure. | |
The datastructure is recursively ordered (ordereddict) then a | |
dataset representation is returned. | |
Args: | |
ds Dataset (mixed seq/mapping/scalar data structure) | |
Returns: | |
Sorted plain-text representation of the input dataset. | |
""" | |
ds = copy.deepcopy(ds) | |
if isinstance(ds, collections.Mapping): | |
res = collections.OrderedDict() | |
for k, v in sorted(ds.items()): | |
res[k] = faithfulrepr(v) | |
return repr(res) | |
if isinstance(ds, list): | |
for i, v in enumerate(ds): | |
ds[i] = faithfulrepr(v) | |
return repr(ds) | |
return repr(ds) | |
def tupelize_dict(ds): | |
"""Group identical values of a dictionary under the same key. The keys | |
become a tuple of all the duplicate keys. | |
Args: | |
ds: Input dictionary | |
Example:: | |
ds = {23: 'foo', | |
25: 'bar', | |
28: 'foo', | |
30: 'bar', | |
33: 'foo'} | |
>>>tupelize_dict(ds) | |
{ | |
(23,28,33): 'foo', | |
(25,30): 'bar' | |
} | |
""" | |
taxonomy = {} | |
binder = collections.defaultdict(list) | |
for key, value in ds.items(): | |
signature = faithfulrepr(value) | |
taxonomy[signature] = value | |
binder[signature].append(key) | |
return {tuple(keys): taxonomy[s] for s, keys in binder.items()} | |
def tupelize_dict2(ds): | |
keys, values = [], [] | |
for key, value in ds.items(): | |
try: | |
index = values.index(value) | |
except ValueError: | |
values.append(value) | |
keys.append([key]) | |
else: | |
keys[index].append(key) | |
return dict(zip(map(tuple, keys), values)) | |
g = DataGenerator() | |
ds = g.gen_dict(30,0.5) | |
ds2 = copy.deepcopy(ds) | |
pr = cProfile.Profile() | |
pr.enable() | |
os1 = tupelize_dict(ds) | |
pr.disable() | |
s = StringIO.StringIO() | |
sortby = 'cumulative' | |
ps = pstats.Stats(pr, stream=s).sort_stats(sortby) | |
ps.print_stats() | |
print s.getvalue() | |
pr = cProfile.Profile() | |
pr.enable() | |
os2 = tupelize_dict2(ds2) | |
pr.disable() | |
s = StringIO.StringIO() | |
sortby = 'cumulative' | |
ps = pstats.Stats(pr, stream=s).sort_stats(sortby) | |
ps.print_stats() | |
print s.getvalue() | |
print os1 == os2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment