Skip to content

Instantly share code, notes, and snippets.

@zed
Created December 6, 2012 07:42
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zed/4babbce1179d77272b68 to your computer and use it in GitHub Desktop.
Save zed/4babbce1179d77272b68 to your computer and use it in GitHub Desktop.
measure performance of various solutions for the "Replace list of list with “condensed” list of list while maintaining order" problem
#!/usr/bin/env python
import sys
from collections import OrderedDict, defaultdict
from copy import deepcopy
from itertools import chain, combinations_with_replacement, repeat
from random import randrange
imap = map # Python 3
try:
from itertools import imap
except ImportError:
pass
from reporttime import accept_funcs, get_functions_with_prefix, measure
# input data for testing, performance measurements
_a = [1, 2, 3]   # building blocks for lst_OP below;
_b = [3, 4]      # several of them share values (3, 4, 8, 9, 21) so
_i = [21, 22]    # the sublists chain together into larger groups
_c = [88, 7, 8]
_e = [5, 4]
_d = [3, 50]
_f = [8, 9]
_g = [9, 10]
_h = [20, 21]
# "OP" dataset: the example sublists from the question, tiled 1000x
lst_OP = [_a, _b, _c, _i, _e, _d, _f, _g, _h, _a, _c, _i] * 1000
# "BK" dataset: two families of 3-element integer runs; the second
# family is shifted by 2 so runs overlap and chain together
lst_BK = ([list(range(4 * ii, 4 * (ii + 1) - 1))
           for ii in range(300)] +
          [list(range(4 * ii + 2, 4 * (ii + 1) + 1))
           for ii in range(300)])
# random dataset: 1000 sublists of up to 99 random values below 1000
lst_rnd = [[randrange(10 ** 3) for _ in repeat(None, randrange(100))]
           for _ in repeat(None, 1000)]
def condense_blckknght(master_list):
    """Merge sublists that (transitively) share elements, keeping order.

    http://stackoverflow.com/a/13716804
    """
    first_seen = {}   # value -> index of the first sublist containing it
    parent = {}       # sublist index -> earlier index (disjoint-set link)

    def root_of(idx):
        # follow parent links until we reach a set representative
        while idx in parent:
            idx = parent[idx]
        return idx

    # pass 1: link sublists that share a value; the smaller index wins
    for idx, sub in enumerate(master_list):
        for item in sub:
            if item not in first_seen:
                first_seen[item] = idx
            else:
                mine = root_of(idx)
                theirs = root_of(first_seen[item])
                if mine < theirs:
                    parent[theirs] = mine
                elif theirs < mine:
                    parent[mine] = theirs
                # equal roots: already in the same set, nothing to do

    # pass 2: emit one output list per representative, adding each
    # value only from the sublist where it first appeared
    condensed = []
    slot_of_root = {}
    for idx, sub in enumerate(master_list):
        if idx not in parent:            # idx is a set representative
            slot_of_root[idx] = len(condensed)
            condensed.append(sub[:])
        else:
            rep = root_of(idx)
            for item in sub:
                if first_seen[item] == idx:
                    condensed[slot_of_root[rep]].append(item)
    return condensed
def disjoint_set_find(djs, node):
    """Return the root of *node*'s set in `djs`, compressing the path.

    `djs` maps child -> parent; a node absent from `djs` is a root.
    Every node visited on the way up is re-pointed directly at the
    root, so later finds on the same chain are O(1).
    """
    root = node
    while root in djs:                  # walk up to the representative
        root = djs[root]
    while node != root:                 # second pass: path compression
        djs[node], node = root, djs[node]
    return root
def condense_blckknght2(master_list):
    """Condense overlapping sublists using an explicit disjoint set.

    Variant of ``condense_blckknght`` that delegates root lookup to
    ``disjoint_set_find`` (which also does path compression).
    http://stackoverflow.com/a/13716804
    """
    values = {}  # maps each value to the first sublist that contains it
    overlaps = {}  # maps sublist indexes to each other to form a disjoint set
    for i, sublist in enumerate(master_list):
        for v in sublist:
            if v not in values:  # no overlap, so just store value
                values[v] = i
            else:  # overlap detected, do a disjoint set union
                current_set = disjoint_set_find(overlaps, i)
                other_set = disjoint_set_find(overlaps, values[v])
                # smaller sublist index becomes the root of the union
                if current_set < other_set:
                    overlaps[other_set] = current_set
                elif other_set < current_set:
                    overlaps[current_set] = other_set
                # ignore case where sets are equal
    output = []  # results
    output_map = {}  # map from original indexes to output indexes
    for i, sublist in enumerate(master_list):
        if i not in overlaps:  # this is a root of a disjoint set
            output_map[i] = len(output)  # update index output mapping
            output.append(sublist[:])  # copy sublist into output * FIXME
        else:
            root = disjoint_set_find(overlaps, i)
            for v in sublist:  # add our unique elements into output list
                # a value is appended only by the sublist where it first
                # appeared, which removes duplicates  * FIXME
                if values[v] == i:
                    output[output_map[root]].append(v)
    return output
def condense_blckknght3(master_list):
    """Condense overlapping sublists; bound-method variant.

    Same union-find scheme as ``condense_blckknght2``, but the output
    phase caches ``list.append`` as a local and marks emitted values by
    overwriting ``values[v]`` with ``None``.
    http://stackoverflow.com/a/13716804
    """
    values = {}  # maps each value to the first sublist that contains it
    overlaps = {}  # maps sublist indexes to each other to form a disjoint set
    for i, sublist in enumerate(master_list):
        for v in sublist:
            if v not in values:  # no overlap, so just store value
                values[v] = i
            else:  # overlap detected, do a disjoint set union
                current_set = disjoint_set_find(overlaps, i)
                other_set = disjoint_set_find(overlaps, values[v])
                if current_set < other_set:
                    overlaps[other_set] = current_set
                elif other_set < current_set:
                    overlaps[current_set] = other_set
                # ignore case where sets are equal
    output = []  # results
    index = {}  # map from original indexes to output indexes
    for i, sublist in enumerate(master_list):
        if i not in overlaps:  # this is a root of a disjoint set
            index[i] = len(output)  # update index output mapping
            output.append([])
            append = output[index[i]].append
        else:
            root = disjoint_set_find(overlaps, i)
            append = output[index[root]].append
        # copy sublist into output removing duplicates
        for v in sublist:  # add our unique elements into output list
            if values[v] is not None:  # not yet emitted, add to output
                append(v)
                values[v] = None  # mark as already seen (destroys `values`)
    return output
def disjoint_set_union(djs, first, second):
    """Merge the sets containing *first* and *second* inside `djs`.

    The smaller of the two roots becomes the root of the combined set;
    if both already share a root, nothing changes.
    """
    root_a = disjoint_set_find(djs, first)
    root_b = disjoint_set_find(djs, second)
    if root_a == root_b:
        return  # already the same set
    if root_a < root_b:
        djs[root_b] = root_a
    else:
        djs[root_a] = root_b
def condense_BK(master_list):
    """Condense overlapping sublists, value-driven output phase.

    http://stackoverflow.com/a/13716804

    Values are remembered in an ``OrderedDict`` so the output phase can
    iterate over them in first-appearance order instead of walking the
    sublists again.
    """
    values = OrderedDict()  # maps values to the first sublist
    # containing them
    overlaps = {}  # maps sublist indexes to each other to form a disjoint set
    for i, sublist in enumerate(master_list):
        for v in sublist:
            if v not in values:  # no overlap, so just store value
                values[v] = i
            else:  # overlap detected, do a disjoint set union
                disjoint_set_union(overlaps, values[v], i)
    output = []  # results
    output_map = {}  # map from disjoint-set roots to output indexes
    for v, i in values.items():  # iterate over values in order
        root = disjoint_set_find(overlaps, i)
        if root not in output_map:
            # BUG FIX: key by `root`, not `i`.  The original wrote
            # ``output_map[i]`` and then read ``output_map[root]``,
            # which only worked because the first value of each group
            # happens to come from the root sublist (i == root there);
            # keying by `root` holds without that fragile invariant.
            output_map[root] = len(output)
            output.append([])  # create new output sublists as necessary
        output[output_map[root]].append(v)
    return output
def condense_jfs(lists):
    """Condense overlapping sublists via set merging, then reorder.

    http://stackoverflow.com/a/13715626
    """
    # remember original positions (first occurrence of each item in the
    # flattened input)
    positions = {}
    for pos, item in enumerate(chain.from_iterable(lists)):
        if item not in positions:
            positions[item] = pos
    # condense disregarding order
    sets = _condense_sets(set(L) for L in lists)
    # restore order; an empty input yields [[]] (the `or` branch)
    return [sorted(s, key=positions.get) for s in sets] or [[]]
def _condense_sets(sets):
result = []
for candidate in sets:
for current in result:
if candidate & current: # found overlap
current |= candidate # combine (merge sets)
# new items from candidate may create an overlap
# between current set and the remaining result sets
result = _condense_sets(result) # merge such sets
break
else: # no common elements found (or result is empty)
result.append(candidate)
return result
def _condense_sberry(lst):  # raises IndexError on lst_BK
    """Condense overlapping sublists by merging output groups in place.

    http://stackoverflow.com/a/13715377

    NOTE(review): excluded from the benchmark (leading underscore)
    because deleting merged groups from ``out`` shifts later indexes
    while ``lookup`` still holds the old ones — this raises IndexError
    on ``lst_BK``.  Indentation reconstructed from the flattened paste.
    """
    lookup = {}  # item -> index of its group in `out`
    out = []
    index = 0    # next free slot in `out`
    for grp in lst:
        # indexes of all groups that already contain items of grp
        keys = [lookup.get(num, None) for num in grp]
        keys = [key for key in keys if key is not None]
        if len(keys):
            if len(set(keys)) != 1:
                # grp bridges several groups: merge them all into the first
                for num in grp:
                    out[keys[0]].append(num)
                # dedupe keys, preserving order (set.add returns None,
                # so the `not seen.add(key)` clause is always true)
                seen = set()
                keys = [key for key in keys
                        if key not in seen and not seen.add(key)]
                for key in keys[1:]:
                    out[keys[0]].extend(out[key])
                    del out[key]  # NOTE: shifts later indexes in `out`
                seen = set()
                out[keys[0]] = [item for item in out[keys[0]]
                                if item not in seen and not seen.add(item)]
            else:
                # grp touches exactly one existing group: extend it
                for num in grp:
                    lookup[num] = keys[0]
                out[keys[0]].extend(grp)
                seen = set()
                out[keys[0]] = [item for item in out[keys[0]]
                                if item not in seen and not seen.add(item)]
        else:
            # no overlap: grp starts a new group (aliases the input list)
            out.append(grp)
            for num in grp:
                lookup[num] = index
            index += 1
    return out
def __condense_eyquem(passed_list):  # fail tests
    """Condense sublists by aliasing dict values to shared lists.

    http://stackoverflow.com/a/13716448

    NOTE(review): excluded from the benchmark (double underscore) — the
    gist marks it as failing the correctness tests.  Indentation was
    reconstructed from the flattened paste; treat the exact nesting of
    the tail of the loop as unverified.
    """
    L = deepcopy(passed_list)  # work on a copy; sublists are mutated below
    dic = OrderedDict()        # item -> the (shared) list it belongs to
    for subl in L:
        for x in subl:
            if x not in dic:
                dic[x] = subl
    for sublist in L:
        # `short` is sublist with duplicates removed, order kept
        short = []
        short.extend(el for el in sublist
                     if el not in short)
        # collect earlier lists that share an element with this one
        seen = []
        for k, val in dic.items():
            if val is sublist:
                break  # stop at this sublist's own entries
            if k in short:
                if val not in seen:
                    seen.append(val)
        # merge all overlapping earlier lists into one shared list
        sumseen = []
        for elseen in seen:
            for y in elseen:
                sumseen.append(y)
                dic[y] = sumseen
        if seen:
            for el in sublist:
                if el not in sumseen:
                    sumseen.append(el)
                    dic[el] = sumseen
            sublist[:] = short
    # emit each distinct shared list once, in dict order
    cumul = []
    cumul.extend(lu for lu in dic.values()
                 if lu not in cumul)
    return cumul
def connected_components(edges):
    """Yield one generator of nodes per connected component of `edges`.

    `edges` is an iterable of (a, b) pairs.  NOTE: the yielded inner
    generators all share the same ``seen`` set, so each one must be
    fully consumed before advancing the outer generator (the caller in
    this file does so via ``sorted``).
    """
    neighbors = defaultdict(set)  # adjacency: node -> set of neighbours
    for a, b in edges:
        neighbors[a].add(b)
        neighbors[b].add(a)
    seen = set()

    def component(node, neighbors=neighbors, seen=seen, see=seen.add):
        # flood-fill from `node` using a worklist set
        unseen = set([node])
        next_unseen = unseen.pop
        while unseen:
            node = next_unseen()
            see(node)
            unseen |= neighbors[node] - seen
            yield node

    for node in neighbors:
        if node not in seen:
            yield component(node)
def _condense_pillmuncher(lists):
    """Yield condensed groups via pairwise-overlap connected components.

    http://stackoverflow.com/a/13736192/

    Builds edges between every pair of (index, tuple) sublists that
    share an element, then walks the connected components; quadratic in
    the number of sublists.  Relies on the module-level ``imap`` shim.
    """
    tuples = combinations_with_replacement(enumerate(imap(tuple, lists)), 2)
    overlapping = ((a, b) for a, b in tuples
                   if not set(a[1]).isdisjoint(b[1]))
    seen = set()
    see = seen.add
    for component in connected_components(overlapping):
        # sort the component's sublists by original index, then emit
        # each item the first time it is encountered
        yield [item for each in sorted(component)
               for item in each[1]
               if not (item in seen or see(item))]
def _condense_pillmuncher2(lists):
"""
http://stackoverflow.com/a/13736192/
"""
neighbors = defaultdict(set)
pos = {}
i = 0
for each in lists:
for i, a in enumerate(each, i + 1):
neighbors[a].update(each)
if a not in pos:
pos[a] = i
seen = set()
def component(node, neighbors=neighbors, seen=seen, see=seen.add):
unseen = set([node])
next_unseen = unseen.pop
while unseen:
node = next_unseen()
see(node)
unseen |= neighbors[node] - seen
yield node
for node in neighbors:
if node not in seen:
yield sorted(component(node), key=pos.get)
def __condense_pillmuncher(lists):  # slow
    """Materialize the lazy `_condense_pillmuncher` generator."""
    return [group for group in _condense_pillmuncher(lists)]
def condense_pillmuncher2(lists):
    """Eager wrapper around the `_condense_pillmuncher2` generator."""
    return [group for group in _condense_pillmuncher2(lists)]
def condense_hynekcer(lists):
    """Condense overlapping sublists; merge-smaller-into-larger sets.

    http://stackoverflow.com/a/13896383

    Output group order follows ``groups`` dict insertion order (group
    creation order, minus groups consumed by merges).
    """
    groups = {}     # items divided into groups {id(the_set): the_set,...}
    members = {}    # mapping from item to group
    positions = {}  # mapping from item to sequential ordering
    iposition = 0   # counter for positions
    for sublist in lists:
        if not sublist or members.get(sublist[0], set()).issuperset(sublist):
            continue  # speed-up condition if all is from one group
        common = set([x for x in sublist if x in members])
        if common:  # any common group can be a destination for merge
            dst_group = members[common.pop()]
            common = common - dst_group  # are more groups common for sublist?
            while common:
                grp = members[common.pop()]
                if len(grp) > len(dst_group):  # merge shorter into longer grp
                    grp, dst_group = dst_group, grp
                dst_group.update(grp)
                for item in grp:
                    members[item] = dst_group  # re-point absorbed items
                del groups[id(grp)]            # absorbed group disappears
                common = common - dst_group
        else:  # or create a new group if nothing common
            dst_group = set()
            groups[id(dst_group)] = dst_group
        newitems = []
        for item in sublist:  # merge also new items
            if item not in positions:
                positions[item] = iposition
                iposition += 1
                newitems.append(item)
                members[item] = dst_group
        dst_group.update(newitems)
    return [sorted(x, key=positions.get) for x in groups.values()]
def _condense_hynekcer2(lists):
"""
http://stackoverflow.com/a/13896383
"""
neighbors = defaultdict(set) # mapping from items to sublists
positions = {} # mapping from items to sequential ordering
position = 0
for each in lists:
for item in each:
neighbors[item].update(each)
if item not in positions:
positions[item] = position
position += 1
seen = set()
see = seen.add
for node in neighbors:
if node not in seen:
unseen = set([node]) # this is a "todo" set
next_unseen = unseen.pop # method alias, not called now
group = [] # collects the output
while unseen:
node = next_unseen()
see(node)
unseen |= neighbors[node] - seen
group.append(node)
yield sorted(group, key=positions.get)
def condense_hynekcer2(lists):
    """Eager wrapper: collect `_condense_hynekcer2`'s groups in a list."""
    return [group for group in _condense_hynekcer2(lists)]
def __condense_qarma(input):  # fail tests
    """Flatten, dedupe, and partition into strings vs. non-strings.

    http://stackoverflow.com/a/13951145

    NOTE(review): excluded from the benchmark (double underscore) — it
    flattens rather than condenses, and under Python 3 ``filter``
    returns a lazy iterator, so ``len(tmp)`` raises TypeError.  The
    parameter name shadows the ``input`` builtin (kept as-is).
    """
    def unique(l):
        # lazily yield each element the first time it is encountered
        s = set()
        for i in l:
            if i not in s:
                s.add(i)
                yield(i)
    # flatten all sublists and drop duplicates, preserving order
    tmp = list(unique(sum(input, [])))
    # partition into [strings, non-strings], dropping empty partitions
    tmp = filter(None, [filter(lambda x: isinstance(x, str), tmp),
                        filter(lambda x: not isinstance(x, str), tmp)])
    # if only numbers or only strings are present, flatten the list
    if len(tmp) == 1:
        tmp = tmp[0]
    return tmp
@accept_funcs
def test_funcs(funcs, args, ignore_order=True):
    """Check that all functions produce the same result.

    Each function should accept a readonly sequence as an argument.
    The first function's (sorted) output is taken as the reference;
    every other function must match it, and none may mutate `args`.
    """
    orig_args = deepcopy(args)
    expected = sorted(funcs[0][1](*args))  # pylint: disable=W0142
    for name, f in funcs:
        r = f(*args)  # pylint: disable=W0142
        if ignore_order:
            r.sort()
        assert r == expected, (name, args, expected)
        # the input must not be mutated by the function under test
        assert orig_args == args, name
def main():
    """Sanity-check every condense_* function, then benchmark them."""
    mod = sys.modules[__name__]
    # only names starting with 'condense_' take part (leading-underscore
    # variants are deliberately excluded as broken or slow)
    funcs = get_functions_with_prefix('condense_', module=mod)
    # correctness first; each dataset is run twice to catch functions
    # that mutate their input
    for args in [lst_BK] * 2 + [lst_OP] * 2:
        test_funcs(funcs, [args])
    verbose = '--verbose' in sys.argv
    # then time every implementation against each dataset
    for name in "lst_OP lst_BK lst_rnd".split():
        lst = getattr(mod, name)
        measure(funcs, args=[lst], comment=name, verbose=verbose)
if __name__ == "__main__":
    main()  # run tests + benchmarks only when executed as a script
# -*- coding: utf-8 -*-
import inspect
import sys
import timeit
from functools import partial, wraps
from timeit import default_timer as timer
def measure_func(func, args):
    """Measure how long `func(*args)` takes in seconds.

    Doubles the repetition count `n` until a single measurement takes
    more than one second, then returns the best per-call time over
    that final run plus two more runs at the same `n`.
    """
    n = 1
    f = partial(func, *args)  # pylint: disable=W0142
    while True:
        start = timer()
        r = timeit.repeat(f, number=n, repeat=1)
        if timer() - start > 1:  # at least 1 second per measurement
            break
        n *= 2
    # min() of 3 total timings reduces scheduling noise
    return min(r + timeit.repeat(f, number=n, repeat=2)) / n
def accept_funcs(func):
    """Decorator: normalize the first argument to (name, func) pairs.

    The wrapped function may be called with either a sequence of bare
    functions or a sequence of (name, function) pairs; bare functions
    are paired with their ``__name__``.
    """
    @wraps(func)
    def normalized(funcs, *args, **kwargs):
        if hasattr(funcs[0], '__name__'):
            funcs = [(fn.__name__, fn) for fn in funcs]
        return func(funcs, *args, **kwargs)
    return normalized
def human_seconds(seconds, fmt="%.3g %s"):
    """Return human-readable string that represents given seconds."""
    value = seconds * 1e6  # work upward from microseconds
    for unit in ("usec", "msec"):
        if value < 1000:
            return fmt % (value, unit)
        value /= 1000
    # anything >= 1 second falls through to here
    return fmt % (value, " sec")
@accept_funcs
def measure(funcs, args, comment='', verbose=False):
    """Report how long `f(*args)` takes for each f in funcs.

    Prints a table of per-function times sorted fastest-first, with a
    ratio column relative to the fastest.  With ``verbose`` each raw
    measurement is printed as it is taken.
    """
    # measure performance
    results = []
    w = max(len(name) for name, _ in funcs)  # width of the name column
    for name, f in funcs:
        results.append((measure_func(f, args), name))
        if verbose:
            print("{:{}s} {:>9s} {}".format(
                name, w, human_seconds(results[-1][0]), comment))
    # print sorted results
    results.sort()
    mint = results[0][0]  # minimal time
    ratios = ["%5.2f" % (t / mint,) for t, _ in results]
    maxratio_width = max(len(r) for r in ratios)
    # header
    print("{:{}s} {:>9s} {:>{}s} {}".format(
        "name", w, "time", "ratio", maxratio_width, "comment"))
    ratios = [s.rjust(maxratio_width) for s in ratios]
    for (t, name), ratio in zip(results, ratios):
        print("{:{}s} {:>9s} {} {}".format(
            name, w, human_seconds(t), ratio, comment))
def get_functions_with_prefix(prefix, module=None):
    """Return `module`'s functions whose names start with *prefix*.

    Results are in ``inspect.getmembers`` order (sorted by name).  When
    `module` is omitted, the caller's own module is inspected.
    """
    if module is None:
        # default to the module of the code that called us
        caller_globals = inspect.currentframe().f_back.f_globals
        module = sys.modules[caller_globals['__name__']]
    members = inspect.getmembers(module, inspect.isfunction)
    return [func for name, func in members if name.startswith(prefix)]
name time ratio comment
condense_hynekcer 5.79 msec 1.00 lst_OP
condense_hynekcer2 7.4 msec 1.28 lst_OP
condense_pillmuncher2 11.5 msec 1.99 lst_OP
condense_blckknght 16.8 msec 2.91 lst_OP
condense_jfs 26 msec 4.49 lst_OP
condense_BK 30.5 msec 5.28 lst_OP
condense_blckknght2 30.9 msec 5.34 lst_OP
condense_blckknght3 32.5 msec 5.62 lst_OP
name time ratio comment
condense_blckknght 964 usec 1.00 lst_BK
condense_blckknght2 1.41 msec 1.47 lst_BK
condense_blckknght3 1.46 msec 1.51 lst_BK
condense_hynekcer2 1.95 msec 2.03 lst_BK
condense_pillmuncher2 2.1 msec 2.18 lst_BK
condense_hynekcer 2.12 msec 2.20 lst_BK
condense_BK 3.39 msec 3.52 lst_BK
condense_jfs 544 msec 563.66 lst_BK
name time ratio comment
condense_hynekcer 6.86 msec 1.00 lst_rnd
condense_jfs 16.8 msec 2.44 lst_rnd
condense_blckknght 28.6 msec 4.16 lst_rnd
condense_blckknght2 56.1 msec 8.18 lst_rnd
condense_blckknght3 56.3 msec 8.21 lst_rnd
condense_BK 70.2 msec 10.23 lst_rnd
condense_pillmuncher2 324 msec 47.22 lst_rnd
condense_hynekcer2 334 msec 48.61 lst_rnd
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment