Skip to content

Instantly share code, notes, and snippets.

@wrgoldstein
Last active May 20, 2023 12:54
Show Gist options
  • Save wrgoldstein/90ab82ab31d97fbff0b49a37b60efabf to your computer and use it in GitHub Desktop.
import time
from random import randint
# make some dumb fake data
def make_row(r):
    """Build one fake ``[id, aliases]`` row of random data.

    The argument is ignored: it is overwritten with a random id on the
    first line, so it only exists to let the function be called once per
    element of an iterable.

    Returns:
        A two-item list ``[row_id, aliases]``. ``row_id`` is a random
        integer from 1 to 200 inclusive; ``aliases`` is a list of 1 to 6
        random integers, each within 15 of ``row_id`` and never below 1.
    """
    r = randint(1, 200)
    # Clamp the lower bound at 1 so aliases are never zero or negative.
    lowest = max(1, r - 15)
    s = [randint(lowest, r + 15) for _ in range(randint(1, 6))]
    return [r, s]
rows = [make_row(r) for r in range(500_000)]
# perform the set operations
# `lookup` maps canonical id -> group of aliases.  Invariant maintained by
# the loop below: groups are pairwise disjoint and each group is stored
# under its own smallest member.
lookup = {}
t = time.time()
for _row_id, aliases in rows:
    # Only the alias values are grouped; the row's first element is not
    # itself added to the group.
    merged = set(aliases)
    if not merged:
        # Nothing to group, and min() of an empty set would raise.
        continue
    # Find every existing group sharing a member with this row, not just
    # the first one: a row that bridges two groups must fuse them, or the
    # groups end up overlapping and the final mapping becomes order-dependent.
    overlapping = [
        canonical_id
        for canonical_id, canonical_set in lookup.items()
        if not merged.isdisjoint(canonical_set)
    ]
    for canonical_id in overlapping:
        merged |= lookup.pop(canonical_id)
    # Keying by the smallest member cannot collide with a remaining group:
    # that member belongs to `merged`, which is disjoint from all of them.
    # (Keying by the row id could silently overwrite an unrelated group,
    # because row ids repeat.)
    lookup[min(merged)] = merged
# map from original id to canonical id
id_to_canonical = {
    alias: canonical_id
    for canonical_id, group in lookup.items()
    for alias in group
}
elapsed = time.time() - t
print(f"grouped {len(rows):,} rows into {len(lookup):,} groups in {elapsed:.2f}s")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment