Created
December 6, 2012 07:42
-
-
Save zed/4babbce1179d77272b68 to your computer and use it in GitHub Desktop.
measure performance of various solution for "Replace list of list with “condensed” list of list while maintaining order" problem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from collections import OrderedDict, defaultdict | |
from itertools import chain, combinations_with_replacement | |
from copy import deepcopy | |
imap = map # Python 3 | |
try: | |
from itertools import imap | |
except ImportError: | |
pass | |
from reporttime import accept_funcs, get_functions_with_prefix, measure | |
# input data for testing, performance measurements | |
_a = [1, 2, 3] | |
_b = [3, 4] | |
_i = [21, 22] | |
_c = [88, 7, 8] | |
_e = [5, 4] | |
_d = [3, 50] | |
_f = [8, 9] | |
_g = [9, 10] | |
_h = [20, 21] | |
lst_OP = [_a, _b, _c, _i, _e, _d, _f, _g, _h, _a, _c, _i] * 1000 | |
lst_BK = ([list(range(4 * ii, 4 * (ii + 1) - 1)) | |
for ii in range(300)] + | |
[list(range(4 * ii + 2, 4 * (ii + 1) + 1)) | |
for ii in range(300)]) | |
def condense_blckknght(master_list): | |
""" | |
http://stackoverflow.com/a/13716804 | |
""" | |
values = {} | |
overlaps = {} | |
for i, sublist in enumerate(master_list): | |
for v in sublist: | |
if v in values: | |
current = i | |
while current in overlaps: | |
current = overlaps[current] | |
other = values[v] | |
while other in overlaps: | |
other = overlaps[other] | |
if current < other: | |
overlaps[other] = current | |
elif other < current: | |
overlaps[current] = other | |
# else: #current == other: # do nothing | |
else: | |
values[v] = i | |
output = [] | |
output_map = {} | |
for i, sublist in enumerate(master_list): | |
if i not in overlaps: | |
output_map[i] = len(output) | |
output.append(sublist[:]) | |
else: | |
root = overlaps[i] | |
while root in overlaps: | |
root = overlaps[root] | |
for v in sublist: | |
if values[v] == i: | |
output[output_map[root]].append(v) | |
return output | |
def disjoint_set_find(djs, node): # disjoint set find, with path | |
# compression | |
if node not in djs: # base case, node is a root of a set | |
return node | |
djs[node] = disjoint_set_find(djs, djs[node]) # recurse, caching results | |
return djs[node] | |
def condense_blckknght2(master_list): | |
""" | |
http://stackoverflow.com/a/13716804 | |
""" | |
values = {} # maps each value to the first sublist that contains it | |
overlaps = {} # maps sublist indexes to each other to form a disjoint set | |
for i, sublist in enumerate(master_list): | |
for v in sublist: | |
if v not in values: # no overlap, so just store value | |
values[v] = i | |
else: # overlap detected, do a disjoint set union | |
current_set = disjoint_set_find(overlaps, i) | |
other_set = disjoint_set_find(overlaps, values[v]) | |
if current_set < other_set: | |
overlaps[other_set] = current_set | |
elif other_set < current_set: | |
overlaps[current_set] = other_set | |
# ignore case where sets are equal | |
output = [] # results | |
output_map = {} # map from original indexes to output indexes | |
for i, sublist in enumerate(master_list): | |
if i not in overlaps: # this is a root of a disjoint set | |
output_map[i] = len(output) # update index output mapping | |
output.append(sublist[:]) # copy sublist into output * FIXME | |
else: | |
root = disjoint_set_find(overlaps, i) | |
for v in sublist: # add our unique elements into output list | |
if values[v] == i: # keep only in the first list * FIXME | |
output[output_map[root]].append(v) | |
return output | |
def condense_blckknght3(master_list): | |
""" | |
http://stackoverflow.com/a/13716804 | |
""" | |
values = {} # maps each value to the first sublist that contains it | |
overlaps = {} # maps sublist indexes to each other to form a disjoint set | |
for i, sublist in enumerate(master_list): | |
for v in sublist: | |
if v not in values: # no overlap, so just store value | |
values[v] = i | |
else: # overlap detected, do a disjoint set union | |
current_set = disjoint_set_find(overlaps, i) | |
other_set = disjoint_set_find(overlaps, values[v]) | |
if current_set < other_set: | |
overlaps[other_set] = current_set | |
elif other_set < current_set: | |
overlaps[current_set] = other_set | |
# ignore case where sets are equal | |
output = [] # results | |
index = {} # map from original indexes to output indexes | |
for i, sublist in enumerate(master_list): | |
if i not in overlaps: # this is a root of a disjoint set | |
index[i] = len(output) # update index output mapping | |
output.append([]) | |
append = output[index[i]].append | |
else: | |
root = disjoint_set_find(overlaps, i) | |
append = output[index[root]].append | |
# copy sublist into output removing duplicates | |
for v in sublist: # add our unique elements into output list | |
if values[v] is not None: # not seen, add to output | |
append(v) | |
values[v] = None # mark as already seen | |
return output | |
def disjoint_set_union(djs, first, second): | |
first = disjoint_set_find(djs, first) # find root of first set | |
second = disjoint_set_find(djs, second) # and of second set | |
if first < second: # make smaller root the root of the new combined set | |
djs[second] = first | |
elif second < first: | |
djs[first] = second | |
# deliberately ignore the case where first == second (same set) | |
def condense_BK(master_list): | |
""" | |
http://stackoverflow.com/a/13716804 | |
""" | |
values = OrderedDict() # maps values to the first sublist | |
# containing them | |
overlaps = {} # maps sublist indexes to each other to form a disjoint set | |
for i, sublist in enumerate(master_list): | |
for v in sublist: | |
if v not in values: # no overlap, so just store value | |
values[v] = i | |
else: # overlap detected, do a disjoint set union | |
disjoint_set_union(overlaps, values[v], i) | |
output = [] # results | |
output_map = {} # map from original indexes to output indexes | |
for v, i, in values.items(): # iterate over values in order | |
root = disjoint_set_find(overlaps, i) | |
if root not in output_map: | |
output_map[i] = len(output) | |
output.append([]) # create new output sublists as necessary | |
output[output_map[root]].append(v) | |
return output | |
def condense_jfs(lists): | |
""" | |
http://stackoverflow.com/a/13715626 | |
""" | |
# remember original positions | |
positions = {} | |
for pos, item in enumerate(chain.from_iterable(lists)): | |
if item not in positions: | |
positions[item] = pos | |
# condense disregarding order | |
sets = _condense_sets(set(L) for L in lists) | |
# restore order | |
result = [sorted(s, key=positions.get) for s in sets] | |
return result | |
def _condense_sets(sets): | |
it = iter(sets) | |
result = [next(it, set())] | |
for candidate in it: | |
for current in result: | |
if candidate & current: # found overlap | |
current |= candidate # combine items added from | |
# candidate may create overlap in yet unseen result | |
# sets (starting from current and to the end of the | |
# result list | |
# call itself recursively to combine such | |
# sets if necessary strictly speaking only r | |
result = _condense_sets(result) | |
break | |
else: # no common elements found | |
result.append(candidate) | |
return result | |
def _condense_sberry(lst): # raises IndexError on lst_BK | |
""" | |
http://stackoverflow.com/a/13715377 | |
""" | |
lookup = {} | |
out = [] | |
index = 0 | |
for grp in lst: | |
keys = [lookup.get(num, None) for num in grp] | |
keys = [key for key in keys if key is not None] | |
if len(keys): | |
if len(set(keys)) != 1: | |
for num in grp: | |
out[keys[0]].append(num) | |
seen = set() | |
keys = [key for key in keys | |
if key not in seen and not seen.add(key)] | |
for key in keys[1:]: | |
out[keys[0]].extend(out[key]) | |
del out[key] | |
seen = set() | |
out[keys[0]] = [item for item in out[keys[0]] | |
if item not in seen and not seen.add(item)] | |
else: | |
for num in grp: | |
lookup[num] = keys[0] | |
out[keys[0]].extend(grp) | |
seen = set() | |
out[keys[0]] = [item for item in out[keys[0]] | |
if item not in seen and not seen.add(item)] | |
else: | |
out.append(grp) | |
for num in grp: | |
lookup[num] = index | |
index += 1 | |
return out | |
def condense_eyquem(passed_list): | |
""" | |
http://stackoverflow.com/a/13716448 | |
""" | |
L = deepcopy(passed_list) | |
dic = OrderedDict() | |
for subl in L: | |
for x in subl: | |
if x not in dic: | |
dic[x] = subl | |
##print 'dic at start' | |
##print '\n'.join('%-3s : %s' % (a,dic[a]) | |
## for a in dic) + '\n' | |
for sublist in L: | |
short = [] | |
short.extend(el for el in sublist | |
if el not in short) | |
seen = [] | |
for k, val in dic.items(): | |
if val is sublist: | |
break | |
if k in short: | |
if val not in seen: | |
seen.append(val) | |
sumseen = [] | |
for elseen in seen: | |
for y in elseen: | |
sumseen.append(y) | |
dic[y] = sumseen | |
if seen: | |
for el in sublist: | |
if el not in sumseen: | |
sumseen.append(el) | |
dic[el] = sumseen | |
sublist[:] = short | |
cumul = [] | |
cumul.extend(lu for lu in dic.values() | |
if lu not in cumul) | |
return cumul | |
def connected_components(edges): | |
neighbors = defaultdict(set) | |
for a, b in edges: | |
neighbors[a].add(b) | |
neighbors[b].add(a) | |
seen = set() | |
def component(node, neighbors=neighbors, seen=seen, see=seen.add): | |
unseen = set([node]) | |
next_unseen = unseen.pop | |
while unseen: | |
node = next_unseen() | |
see(node) | |
unseen |= neighbors[node] - seen | |
yield node | |
for node in neighbors: | |
if node not in seen: | |
yield component(node) | |
def _condense_pillmuncher(lists): | |
""" | |
http://stackoverflow.com/a/13736192/ | |
""" | |
tuples = combinations_with_replacement(enumerate(imap(tuple, lists)), 2) | |
overlapping = ((a, b) for a, b in tuples | |
if not set(a[1]).isdisjoint(b[1])) | |
seen = set() | |
see = seen.add | |
for component in connected_components(overlapping): | |
yield [item for each in sorted(component) | |
for item in each[1] | |
if not (item in seen or see(item))] | |
def _condense_pillmuncher2(lists): | |
""" | |
http://stackoverflow.com/a/13736192/ | |
""" | |
neighbors = defaultdict(set) | |
pos = {} | |
i = 0 | |
for each in lists: | |
for i, a in enumerate(each, i + 1): | |
neighbors[a].update(each) | |
if a not in pos: | |
pos[a] = i | |
seen = set() | |
def component(node, neighbors=neighbors, seen=seen, see=seen.add): | |
unseen = set([node]) | |
next_unseen = unseen.pop | |
while unseen: | |
node = next_unseen() | |
see(node) | |
unseen |= neighbors[node] - seen | |
yield node | |
for node in neighbors: | |
if node not in seen: | |
yield sorted(component(node), key=pos.get) | |
def __condense_pillmuncher(lists): # slow | |
return list(_condense_pillmuncher(lists)) | |
def condense_pillmuncher2(lists): | |
return list(_condense_pillmuncher2(lists)) | |
@accept_funcs | |
def test_funcs(funcs, args, ignore_order=True): | |
"""Check that all functions produce the same result. | |
Each function should accept a readonly sequence as an argument | |
""" | |
orig_args = deepcopy(args) | |
expected = sorted(funcs[0][1](*args)) # pylint: disable=W0142 | |
for name, f in funcs: | |
r = f(*args) # pylint: disable=W0142 | |
if ignore_order: | |
r.sort() | |
assert r == expected, (name, args, expected) | |
assert orig_args == args, name | |
def main(): | |
funcs = get_functions_with_prefix('condense_') | |
for args in [lst_BK[:100]] * 2 + [lst_OP[:100]] * 2: | |
test_funcs(funcs, [args]) | |
measure(funcs, args=[lst_BK], comment='lst_BK') | |
measure(funcs, args=[lst_OP], comment='lst_OP') | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import inspect | |
import sys | |
import timeit | |
from functools import partial, wraps | |
from timeit import default_timer as timer | |
def measure_func(func, args): | |
"""Measure how long `func(*args)` takes in seconds.""" | |
n = 1 | |
f = partial(func, *args) # pylint: disable=W0142 | |
while True: | |
start = timer() | |
r = timeit.repeat(f, number=n, repeat=1) | |
if timer() - start > 1: # at least 1 second per measurement | |
break | |
n *= 2 | |
return min(r + timeit.repeat(f, number=n, repeat=2)) / n | |
def accept_funcs(func): | |
"""Add function names if necessary. | |
Decorator for functions that accept either a sequence of functions | |
or (name, function) pairs. | |
""" | |
@wraps(func) | |
def wrapper(funcs, *args, **kwargs): | |
if hasattr(funcs[0], '__name__'): | |
funcs = [(f.__name__, f) for f in funcs] | |
return func(funcs, *args, **kwargs) | |
return wrapper | |
def human_seconds(seconds, fmt="%.3g %s"): | |
"""Return human-readable string that represents given seconds.""" | |
t = 1e6 * seconds # start with µsec | |
for suff in "usec msec".split(): | |
if t < 1000: | |
return fmt % (t, suff) | |
t /= 1000 | |
return fmt % (t, " sec") | |
@accept_funcs | |
def measure(funcs, args, comment=''): | |
"""Report how long `f(*args)` takes for each f in funcs.""" | |
results = [] | |
w = max(len(name) for name, _ in funcs) | |
for name, f in funcs: | |
results.append((measure_func(f, args), name)) | |
print("{:{}s} {:>9s} {}".format( | |
name, w, human_seconds(results[-1][0]), comment)) | |
print("\n{:{}s} {:>9s} {:>5s} {}".format("name", w, | |
*"time ratio comment".split())) | |
results.sort() | |
mint = results[0][0] | |
for t, name in results: | |
print("{:{}s} {:>9s} {:5.2f} {}".format( | |
name, w, human_seconds(t), t / mint, comment)) | |
print("\n") | |
def get_functions_with_prefix(prefix, module=None): | |
if module is None: # use caller's module | |
modname = inspect.currentframe().f_back.f_globals['__name__'] | |
module = sys.modules[modname] | |
all_funcs = inspect.getmembers(module, inspect.isfunction) | |
return [f for name, f in all_funcs if name.startswith(prefix)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python 3: | |
name time ratio comment | |
condense_blckknght 1.01 msec 1.00 lst_BK | |
condense_blckknght3 1.42 msec 1.41 lst_BK | |
condense_blckknght2 1.44 msec 1.43 lst_BK | |
condense_pillmuncher2 2.35 msec 2.33 lst_BK | |
condense_BK 4.48 msec 4.44 lst_BK | |
condense_pillmuncher 99.6 msec 98.91 lst_BK | |
condense_eyquem 412 msec 408.52 lst_BK | |
condense_jfs 584 msec 580.10 lst_BK | |
name time ratio comment | |
condense_pillmuncher2 14.2 msec 1.00 lst_OP | |
condense_blckknght 20.3 msec 1.43 lst_OP | |
condense_BK 32.8 msec 2.31 lst_OP | |
condense_jfs 33.6 msec 2.37 lst_OP | |
condense_blckknght2 34.6 msec 2.44 lst_OP | |
condense_blckknght3 35.7 msec 2.51 lst_OP | |
condense_eyquem 236 msec 16.62 lst_OP | |
condense_pillmuncher 58.1 sec 4088.24 lst_OP | |
Python 2: | |
name time ratio comment | |
condense_blckknght 948 usec 1.00 lst_BK | |
condense_blckknght2 1.42 msec 1.50 lst_BK | |
condense_blckknght3 1.48 msec 1.57 lst_BK | |
condense_pillmuncher2 2.13 msec 2.25 lst_BK | |
condense_BK 3.45 msec 3.64 lst_BK | |
condense_pillmuncher 86 msec 90.78 lst_BK | |
condense_eyquem 377 msec 397.38 lst_BK | |
condense_jfs 541 msec 571.26 lst_BK | |
name time ratio comment | |
condense_pillmuncher2 11.6 msec 1.00 lst_OP | |
condense_blckknght 17.6 msec 1.51 lst_OP | |
condense_jfs 28.8 msec 2.47 lst_OP | |
condense_blckknght2 31.4 msec 2.69 lst_OP | |
condense_BK 31.5 msec 2.70 lst_OP | |
condense_blckknght3 33.2 msec 2.85 lst_OP | |
condense_eyquem 162 msec 13.88 lst_OP | |
condense_pillmuncher 49.3 sec 4234.91 lst_OP |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment