Created
December 6, 2012 07:42
-
-
Save zed/4babbce1179d77272b68 to your computer and use it in GitHub Desktop.
measure performance of various solutions for the "Replace list of list with “condensed” list of list while maintaining order" problem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
from collections import OrderedDict, defaultdict | |
from copy import deepcopy | |
from itertools import chain, combinations_with_replacement, repeat | |
from random import randrange | |
imap = map # Python 3 | |
try: | |
from itertools import imap | |
except ImportError: | |
pass | |
from reporttime import accept_funcs, get_functions_with_prefix, measure | |
# input data for testing, performance measurements
_a = [1, 2, 3]
_b = [3, 4]      # overlaps _a via 3
_i = [21, 22]
_c = [88, 7, 8]
_e = [5, 4]
_d = [3, 50]
_f = [8, 9]
_g = [9, 10]
_h = [20, 21]
# lst_OP: the small overlapping sublists from the original question,
# repeated 1000 times
lst_OP = [_a, _b, _c, _i, _e, _d, _f, _g, _h, _a, _c, _i] * 1000
# lst_BK: 300 length-3 ranges plus 300 ranges shifted by 2 that chain
# neighbouring ranges together (Blckknght's stress case)
lst_BK = ([list(range(4 * ii, 4 * (ii + 1) - 1))
           for ii in range(300)] +
          [list(range(4 * ii + 2, 4 * (ii + 1) + 1))
           for ii in range(300)])
# lst_rnd: 1000 sublists of random length (< 100) with values below 1000
lst_rnd = [[randrange(10 ** 3) for _ in repeat(None, randrange(100))]
           for _ in repeat(None, 1000)]
def condense_blckknght(master_list):
    """Merge sublists that share any element, preserving sublist order.

    http://stackoverflow.com/a/13716804
    """
    first_seen = {}  # value -> index of the first sublist containing it
    parent = {}      # union-find links between sublist indices

    def find(idx):
        # walk links to the root of idx's set (no path compression here)
        while idx in parent:
            idx = parent[idx]
        return idx

    for idx, sub in enumerate(master_list):
        for item in sub:
            if item not in first_seen:
                first_seen[item] = idx
            else:
                a = find(idx)
                b = find(first_seen[item])
                # smaller index becomes the root; equal roots need no link
                if a < b:
                    parent[b] = a
                elif b < a:
                    parent[a] = b

    merged = []
    slot = {}  # root sublist index -> position in merged
    for idx, sub in enumerate(master_list):
        if idx not in parent:
            # idx is a root: it owns a fresh output sublist
            slot[idx] = len(merged)
            merged.append(list(sub))
        else:
            root = find(idx)
            # append only items first seen in this sublist, to the root's slot
            for item in sub:
                if first_seen[item] == idx:
                    merged[slot[root]].append(item)
    return merged
def disjoint_set_find(djs, node):
    """Return the root of *node*'s set, compressing the path on the way.

    A node absent from *djs* is its own root (base case).
    """
    # first pass: locate the root
    root = node
    while root in djs:
        root = djs[root]
    # second pass: point every node on the path directly at the root
    while node != root:
        successor = djs[node]
        djs[node] = root
        node = successor
    return root
def condense_blckknght2(master_list):
    """Merge sublists that share elements, using disjoint_set_find.

    http://stackoverflow.com/a/13716804
    """
    first_seen = {}  # maps each value to the first sublist that contains it
    links = {}       # disjoint-set structure over sublist indices
    for idx, sub in enumerate(master_list):
        for item in sub:
            if item in first_seen:
                # overlap: union the two sets, smaller root wins
                root_here = disjoint_set_find(links, idx)
                root_there = disjoint_set_find(links, first_seen[item])
                if root_here < root_there:
                    links[root_there] = root_here
                elif root_there < root_here:
                    links[root_here] = root_there
                # equal roots: already in the same set, nothing to do
            else:
                first_seen[item] = idx
    merged = []  # results
    slot = {}    # original root index -> output index
    for idx, sub in enumerate(master_list):
        if idx in links:
            # non-root: add only items first seen here to the root's sublist
            root = disjoint_set_find(links, idx)
            for item in sub:
                if first_seen[item] == idx:
                    merged[slot[root]].append(item)
        else:
            # root of a disjoint set: copy its sublist into the output
            slot[idx] = len(merged)
            merged.append(list(sub))
    return merged
def condense_blckknght3(master_list):
    """Merge sublists sharing elements; dedupe via sentinel marking.

    http://stackoverflow.com/a/13716804
    """
    first_seen = {}  # maps each value to the first sublist that contains it
    links = {}       # disjoint-set structure over sublist indices
    for idx, sub in enumerate(master_list):
        for item in sub:
            if item in first_seen:
                # overlap: union the two sets, smaller root wins
                a = disjoint_set_find(links, idx)
                b = disjoint_set_find(links, first_seen[item])
                if a < b:
                    links[b] = a
                elif b < a:
                    links[a] = b
            else:
                first_seen[item] = idx
    merged = []  # results
    slot = {}    # original root index -> output index
    for idx, sub in enumerate(master_list):
        if idx not in links:
            # root of a disjoint set: open a fresh output sublist
            slot[idx] = len(merged)
            merged.append([])
            add = merged[slot[idx]].append
        else:
            add = merged[slot[disjoint_set_find(links, idx)]].append
        # copy this sublist into the output, skipping already-seen items
        for item in sub:
            if first_seen[item] is not None:  # not yet emitted
                add(item)
                first_seen[item] = None  # mark as already emitted
    return merged
def disjoint_set_union(djs, first, second):
    """Merge (in place) the sets containing *first* and *second*.

    The numerically smaller root becomes the root of the combined set;
    when both already share a root nothing is changed.
    """
    root_a = disjoint_set_find(djs, first)
    root_b = disjoint_set_find(djs, second)
    if root_a != root_b:
        lo, hi = (root_a, root_b) if root_a < root_b else (root_b, root_a)
        djs[hi] = lo
def condense_BK(master_list):
    """Condense overlapping sublists via explicit disjoint-set union/find.

    http://stackoverflow.com/a/13716804

    Output elements keep first-seen order because ``values`` is an
    OrderedDict iterated in insertion order.
    """
    values = OrderedDict()  # maps values to the first sublist
    # containing them
    overlaps = {}  # maps sublist indexes to each other to form a disjoint set
    for i, sublist in enumerate(master_list):
        for v in sublist:
            if v not in values:  # no overlap, so just store value
                values[v] = i
            else:  # overlap detected, do a disjoint set union
                disjoint_set_union(overlaps, values[v], i)
    output = []  # results
    output_map = {}  # map from original indexes to output indexes
    for v, i, in values.items():  # iterate over values in order
        root = disjoint_set_find(overlaps, i)
        if root not in output_map:
            # NOTE(review): stores under key ``i`` while testing key
            # ``root``; this appears safe only because the first value
            # seen for a component comes from that component's root
            # sublist (so i == root here) -- confirm before reusing.
            output_map[i] = len(output)
            output.append([])  # create new output sublists as necessary
        output[output_map[root]].append(v)
    return output
def condense_jfs(lists):
    """Condense *lists* by set merging, then restore first-seen order.

    http://stackoverflow.com/a/13715626
    """
    # remember the first position at which each item appears
    order = {}
    for idx, item in enumerate(chain.from_iterable(lists)):
        order.setdefault(item, idx)
    # merge overlapping sets, ignoring order for now
    merged = _condense_sets(set(sub) for sub in lists)
    # rebuild lists sorted by original first appearance
    return [sorted(group, key=order.get) for group in merged] or [[]]
def _condense_sets(sets): | |
result = [] | |
for candidate in sets: | |
for current in result: | |
if candidate & current: # found overlap | |
current |= candidate # combine (merge sets) | |
# new items from candidate may create an overlap | |
# between current set and the remaining result sets | |
result = _condense_sets(result) # merge such sets | |
break | |
else: # no common elements found (or result is empty) | |
result.append(candidate) | |
return result | |
def _condense_sberry(lst):  # raises IndexError on lst_BK
    """Condense by filing each value under the output slot it first landed in.

    http://stackoverflow.com/a/13715377

    NOTE(review): the leading underscore keeps this out of the measured
    ``condense_`` set; the inline comment records a known IndexError on
    the lst_BK input.
    """
    lookup = {}  # value -> index of the output slot holding it
    out = []
    index = 0
    for grp in lst:
        # output slots that already hold any value of this group
        keys = [lookup.get(num, None) for num in grp]
        keys = [key for key in keys if key is not None]
        if len(keys):
            if len(set(keys)) != 1:
                # group spans several slots: dump the group into the
                # first slot, then fold the other slots into it
                for num in grp:
                    out[keys[0]].append(num)
                # dedupe keys preserving order (set.add returns None)
                seen = set()
                keys = [key for key in keys
                        if key not in seen and not seen.add(key)]
                for key in keys[1:]:
                    out[keys[0]].extend(out[key])
                    # NOTE(review): del shifts later slots but lookup is
                    # not re-indexed -- plausibly the IndexError source
                    del out[key]
                seen = set()
                out[keys[0]] = [item for item in out[keys[0]]
                                if item not in seen and not seen.add(item)]
            else:
                # exactly one slot: extend it and dedupe in place
                for num in grp:
                    lookup[num] = keys[0]
                out[keys[0]].extend(grp)
                seen = set()
                out[keys[0]] = [item for item in out[keys[0]]
                                if item not in seen and not seen.add(item)]
        else:
            # no overlap with anything seen so far: open a new slot
            out.append(grp)
            for num in grp:
                lookup[num] = index
            index += 1
    return out
def __condense_eyquem(passed_list):  # fail tests
    """http://stackoverflow.com/a/13716448

    NOTE(review): the double-underscore prefix keeps this out of the
    measured/tested ``condense_`` set; the inline comment marks it as a
    known-failing implementation, kept for reference only.
    """
    L = deepcopy(passed_list)
    dic = OrderedDict()
    # map each value to the first sublist containing it
    for subl in L:
        for x in subl:
            if x not in dic:
                dic[x] = subl
    ##print 'dic at start'
    ##print '\n'.join('%-3s : %s' % (a,dic[a])
    ##                for a in dic) + '\n'
    for sublist in L:
        # short: this sublist with duplicates removed, order kept
        short = []
        short.extend(el for el in sublist
                     if el not in short)
        # seen: earlier sublists that share an element with this one
        seen = []
        for k, val in dic.items():
            if val is sublist:
                break
            if k in short:
                if val not in seen:
                    seen.append(val)
        # sumseen: concatenation of all overlapping earlier sublists
        sumseen = []
        for elseen in seen:
            for y in elseen:
                sumseen.append(y)
                dic[y] = sumseen
        if seen:
            # append this sublist's new elements to the merged list
            for el in sublist:
                if el not in sumseen:
                    sumseen.append(el)
                    dic[el] = sumseen
        sublist[:] = short
    # collect distinct merged lists in first-appearance order
    cumul = []
    cumul.extend(lu for lu in dic.values()
                 if lu not in cumul)
    return cumul
def connected_components(edges):
    """Yield one lazy generator of nodes per connected component of *edges*.

    The yielded generators share the ``visited`` set, so each should be
    consumed before advancing to the next.
    """
    adjacency = defaultdict(set)
    for u, w in edges:
        adjacency[u].add(w)
        adjacency[w].add(u)
    visited = set()

    def walk(start):
        # expand outward from start; order within a component is unspecified
        frontier = set([start])
        while frontier:
            current = frontier.pop()
            visited.add(current)
            frontier |= adjacency[current] - visited
            yield current

    for candidate in adjacency:
        if candidate not in visited:
            yield walk(candidate)
def _condense_pillmuncher(lists):
    """Condense via an overlap graph over sublists (generator version).

    http://stackoverflow.com/a/13736192/
    """
    indexed = combinations_with_replacement(enumerate(imap(tuple, lists)), 2)
    # edge between two (index, tuple) nodes whenever the tuples overlap
    edges = ((left, right) for left, right in indexed
             if not set(left[1]).isdisjoint(right[1]))
    emitted = set()
    mark = emitted.add
    for component in connected_components(edges):
        # flatten each component in sublist order, skipping repeats
        yield [item for _, items in sorted(component)
               for item in items
               if not (item in emitted or mark(item))]
def _condense_pillmuncher2(lists): | |
""" | |
http://stackoverflow.com/a/13736192/ | |
""" | |
neighbors = defaultdict(set) | |
pos = {} | |
i = 0 | |
for each in lists: | |
for i, a in enumerate(each, i + 1): | |
neighbors[a].update(each) | |
if a not in pos: | |
pos[a] = i | |
seen = set() | |
def component(node, neighbors=neighbors, seen=seen, see=seen.add): | |
unseen = set([node]) | |
next_unseen = unseen.pop | |
while unseen: | |
node = next_unseen() | |
see(node) | |
unseen |= neighbors[node] - seen | |
yield node | |
for node in neighbors: | |
if node not in seen: | |
yield sorted(component(node), key=pos.get) | |
def __condense_pillmuncher(lists):  # slow
    """List-returning wrapper; double underscore keeps it out of the
    measured ``condense_`` set (marked slow)."""
    return list(_condense_pillmuncher(lists))
def condense_pillmuncher2(lists):
    """List-returning wrapper around the _condense_pillmuncher2 generator."""
    return list(_condense_pillmuncher2(lists))
def condense_hynekcer(lists):
    """Condense overlapping sublists, keeping first-seen element order.

    http://stackoverflow.com/a/13896383

    Maintains live sets keyed by ``id()`` and merges the shorter group
    into the longer on overlap; output order is recovered from the
    ``positions`` map.
    """
    groups = {}  # items divided into groups {id(the_set): the_set,...}
    members = {}  # mapping from item to group
    positions = {}  # mapping from item to sequential ordering
    iposition = 0  # counter for positions
    for sublist in lists:
        if not sublist or members.get(sublist[0], set()).issuperset(sublist):
            continue  # speed-up condition if all is from one group
        common = set([x for x in sublist if x in members])
        if common:  # any common group can be a destination for merge
            dst_group = members[common.pop()]
            common = common - dst_group  # are more groups common for sublist?
            while common:
                grp = members[common.pop()]
                if len(grp) > len(dst_group):  # merge shorter into longer grp
                    grp, dst_group = dst_group, grp
                dst_group.update(grp)
                for item in grp:
                    members[item] = dst_group  # repoint merged items
                del groups[id(grp)]  # the absorbed group no longer exists
                common = common - dst_group
        else:  # or create a new group if nothing common
            dst_group = set()
            groups[id(dst_group)] = dst_group
        newitems = []
        for item in sublist:  # merge also new items
            if item not in positions:
                positions[item] = iposition
                iposition += 1
                newitems.append(item)
                members[item] = dst_group
        dst_group.update(newitems)
    return [sorted(x, key=positions.get) for x in groups.values()]
def _condense_hynekcer2(lists): | |
""" | |
http://stackoverflow.com/a/13896383 | |
""" | |
neighbors = defaultdict(set) # mapping from items to sublists | |
positions = {} # mapping from items to sequential ordering | |
position = 0 | |
for each in lists: | |
for item in each: | |
neighbors[item].update(each) | |
if item not in positions: | |
positions[item] = position | |
position += 1 | |
seen = set() | |
see = seen.add | |
for node in neighbors: | |
if node not in seen: | |
unseen = set([node]) # this is a "todo" set | |
next_unseen = unseen.pop # method alias, not called now | |
group = [] # collects the output | |
while unseen: | |
node = next_unseen() | |
see(node) | |
unseen |= neighbors[node] - seen | |
group.append(node) | |
yield sorted(group, key=positions.get) | |
def condense_hynekcer2(lists):
    """List-returning wrapper around the _condense_hynekcer2 generator."""
    return list(_condense_hynekcer2(lists))
def __condense_qarma(input):  # fail tests
    """http://stackoverflow.com/a/13951145

    NOTE(review): the double-underscore prefix keeps this out of the
    measured/tested ``condense_`` set.  Under Python 3 ``filter``
    returns a lazy iterator, so ``len(tmp)`` below raises TypeError --
    presumably one reason it is marked "fail tests".
    """
    def unique(l):
        # yield each item once, in first-seen order
        s = set()
        for i in l:
            if i not in s:
                s.add(i)
                yield(i)
    tmp = list(unique(sum(input, [])))
    # split into strings vs non-strings, dropping empty halves
    tmp = filter(None, [filter(lambda x: isinstance(x, str), tmp),
                        filter(lambda x: not isinstance(x, str), tmp)])
    # if only numbers or only strings are present, flatten the list
    if len(tmp) == 1:
        tmp = tmp[0]
    return tmp
@accept_funcs
def test_funcs(funcs, args, ignore_order=True):
    """Check that all functions produce the same result.

    Each function should accept a readonly sequence as an argument.
    The first function's output is the reference.

    Bug fix: the reference is now sorted only when ``ignore_order`` is
    true; previously it was always sorted, so ``ignore_order=False``
    compared unsorted results against a sorted reference.
    """
    orig_args = deepcopy(args)
    expected = funcs[0][1](*args)  # pylint: disable=W0142
    if ignore_order:
        expected = sorted(expected)
    for name, f in funcs:
        r = f(*args)  # pylint: disable=W0142
        if ignore_order:
            r.sort()
        assert r == expected, (name, args, expected)
        # functions must not mutate their input
        assert orig_args == args, name
def main():
    """Cross-check all ``condense_*`` implementations, then benchmark them."""
    mod = sys.modules[__name__]
    funcs = get_functions_with_prefix('condense_', module=mod)
    # correctness first: every function must agree on both inputs (each
    # given twice; NOTE(review): presumably to expose state kept between
    # calls -- confirm intent)
    for args in [lst_BK] * 2 + [lst_OP] * 2:
        test_funcs(funcs, [args])
    verbose = '--verbose' in sys.argv
    # one timing table per named data set
    for name in "lst_OP lst_BK lst_rnd".split():
        lst = getattr(mod, name)
        measure(funcs, args=[lst], comment=name, verbose=verbose)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import inspect | |
import sys | |
import timeit | |
from functools import partial, wraps | |
from timeit import default_timer as timer | |
def measure_func(func, args):
    """Measure how long `func(*args)` takes in seconds.

    Doubles the repetition count until a single timing batch exceeds one
    second, then returns the best per-call time over three batches.
    """
    n = 1
    f = partial(func, *args)  # pylint: disable=W0142
    while True:
        start = timer()
        r = timeit.repeat(f, number=n, repeat=1)
        if timer() - start > 1:  # at least 1 second per measurement
            break
        n *= 2
    # reuse the calibration batch plus two more; keep the minimum
    return min(r + timeit.repeat(f, number=n, repeat=2)) / n
def accept_funcs(func):
    """Decorator normalising the first argument to (name, function) pairs.

    The wrapped function may be called with either a sequence of plain
    functions or a sequence of (name, function) pairs.
    """
    @wraps(func)
    def wrapper(funcs, *args, **kwargs):
        named = funcs
        if hasattr(funcs[0], '__name__'):  # plain functions: attach names
            named = [(fn.__name__, fn) for fn in funcs]
        return func(named, *args, **kwargs)
    return wrapper
def human_seconds(seconds, fmt="%.3g %s"):
    """Format *seconds* as a short human-readable string (usec/msec/sec)."""
    value = seconds * 1e6  # work upward from microseconds
    for unit in ("usec", "msec"):
        if value < 1000:
            return fmt % (value, unit)
        value /= 1000
    # " sec" is padded to four chars so unit columns line up
    return fmt % (value, " sec")
@accept_funcs
def measure(funcs, args, comment='', verbose=False):
    """Time each function on *args* and print a sorted comparison table."""
    name_width = max(len(name) for name, _ in funcs)
    timings = []
    for name, func in funcs:
        timings.append((measure_func(func, args), name))
        if verbose:  # report each result as soon as it is measured
            print("{:{}s} {:>9s} {}".format(
                name, name_width, human_seconds(timings[-1][0]), comment))
    # sorted table, fastest first, with ratios relative to the fastest
    timings.sort()
    fastest = timings[0][0]
    ratios = ["%5.2f" % (t / fastest,) for t, _ in timings]
    ratio_width = max(len(r) for r in ratios)
    # header
    print("{:{}s} {:>9s} {:>{}s} {}".format(
        "name", name_width, "time", "ratio", ratio_width, "comment"))
    padded = [s.rjust(ratio_width) for s in ratios]
    for (t, name), ratio in zip(timings, padded):
        print("{:{}s} {:>9s} {} {}".format(
            name, name_width, human_seconds(t), ratio, comment))
def get_functions_with_prefix(prefix, module=None):
    """Return *module*'s functions whose names start with *prefix*.

    When *module* is None, the caller's own module is inspected.
    """
    if module is None:  # use caller's module
        caller_globals = inspect.currentframe().f_back.f_globals
        module = sys.modules[caller_globals['__name__']]
    return [func
            for name, func in inspect.getmembers(module, inspect.isfunction)
            if name.startswith(prefix)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name time ratio comment | |
condense_hynekcer 5.79 msec 1.00 lst_OP | |
condense_hynekcer2 7.4 msec 1.28 lst_OP | |
condense_pillmuncher2 11.5 msec 1.99 lst_OP | |
condense_blckknght 16.8 msec 2.91 lst_OP | |
condense_jfs 26 msec 4.49 lst_OP | |
condense_BK 30.5 msec 5.28 lst_OP | |
condense_blckknght2 30.9 msec 5.34 lst_OP | |
condense_blckknght3 32.5 msec 5.62 lst_OP | |
name time ratio comment | |
condense_blckknght 964 usec 1.00 lst_BK | |
condense_blckknght2 1.41 msec 1.47 lst_BK | |
condense_blckknght3 1.46 msec 1.51 lst_BK | |
condense_hynekcer2 1.95 msec 2.03 lst_BK | |
condense_pillmuncher2 2.1 msec 2.18 lst_BK | |
condense_hynekcer 2.12 msec 2.20 lst_BK | |
condense_BK 3.39 msec 3.52 lst_BK | |
condense_jfs 544 msec 563.66 lst_BK | |
name time ratio comment | |
condense_hynekcer 6.86 msec 1.00 lst_rnd | |
condense_jfs 16.8 msec 2.44 lst_rnd | |
condense_blckknght 28.6 msec 4.16 lst_rnd | |
condense_blckknght2 56.1 msec 8.18 lst_rnd | |
condense_blckknght3 56.3 msec 8.21 lst_rnd | |
condense_BK 70.2 msec 10.23 lst_rnd | |
condense_pillmuncher2 324 msec 47.22 lst_rnd | |
condense_hynekcer2 334 msec 48.61 lst_rnd |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment