Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Last active September 20, 2022 15:16
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pervognsen/b21f6dd13f4bcb4ff2123f0d78fcfd17 to your computer and use it in GitHub Desktop.
Save pervognsen/b21f6dd13f4bcb4ff2123f0d78fcfd17 to your computer and use it in GitHub Desktop.
# Hash, displace, and compress: http://cmph.sourceforge.net/papers/esa09.pdf
# This is expected linear time for any seeded hash function that acts like a random hash function (universality isn't enough).
# (Actually, the code as written is O(n log n) when targeting 100% load. It's O(n) when targeting any smaller load factor.)
# You can make keys_per_bucket higher than the default of 4 but construction time will start to increase dramatically.
# The paper this is based on compresses the seeds (so the fact that the algorithm tries seeds in increasing order is important)
# which brings the representation size close to the information-theoretical minimum. I don't do any of that here, but it could
# be done as a postprocess.
def make_perfect_hash(keys, load_factor=1.0, keys_per_bucket=4, rhash=murmurhash, max_seed=1000000):
m = int(len(keys) / load_factor)
r = int(len(keys) / keys_per_bucket)
buckets = [[] for i in range(r)]
for k in keys: buckets[rhash(0, k, r)].append(k)
occupied = set()
seeds = [0 for i in range(r)]
for i, bucket in sorted(enumerate(buckets), reverse=True, key=lambda x: len(x[1])):
for seed in range(max_seed):
bucket_occupied = set()
for k in bucket:
h = rhash(seed, k, m)
if h in bucket_occupied or h in occupied: break
bucket_occupied.add(h)
else:
occupied.update(bucket_occupied)
seeds[i] = seed
break
else:
raise ValueError("Failed to find perfect hash function")
return lambda k: rhash(seeds[rhash(0, k, r)], k, m)
words = open("words.txt").read().splitlines() # ~100,000 words
perfect_hash = make_perfect_hash(words) # ~30 seconds
assert sorted(perfect_hash(word) for word in words) == list(range(len(words)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment