pervognsen/compress_hash_displace.py

## compress_hash_displace.py
# Hash, displace, and compress: http://cmph.sourceforge.net/papers/esa09.pdf
# This is expected linear time for any seeded hash function that acts like a random hash function (universality isn't enough).
# (Actually, the code as written is O(n log n) when targeting 100% load. It's O(n) when targeting any smaller load factor.)
# You can make keys_per_bucket higher than the default of 4 but construction time will start to increase dramatically.
# The paper this is based on compresses the seeds (so the fact that the algorithm tries seeds in increasing order is important)
# which brings the representation size close to the information-theoretical minimum. I don't do any of that here, but it could
# be done as a postprocess.
def make_perfect_hash(keys, load_factor=1.0, keys_per_bucket=4, rhash=murmurhash, max_seed=1000000):
    m = int(len(keys) / load_factor)
    r = int(len(keys) / keys_per_bucket)
    buckets = [[] for i in range(r)]
    for k in keys: buckets[rhash(0, k, r)].append(k)
    occupied = set()
    seeds = [0 for i in range(r)]
    for i, bucket in sorted(enumerate(buckets), reverse=True, key=lambda x: len(x[1])):
        for seed in range(max_seed):
            bucket_occupied = set()
            for k in bucket:
                h = rhash(seed, k, m)
                if h in bucket_occupied or h in occupied: break
                bucket_occupied.add(h)
            else:
                occupied.update(bucket_occupied)
                seeds[i] = seed
                break
        else:
            raise ValueError("Failed to find perfect hash function")
    return lambda k: rhash(seeds[rhash(0, k, r)], k, m)

words = open("words.txt").read().splitlines() # ~100,000 words
perfect_hash = make_perfect_hash(words) # ~30 seconds
assert sorted(perfect_hash(word) for word in words) == list(range(len(words)))
	# Hash, displace, and compress: http://cmph.sourceforge.net/papers/esa09.pdf
	# This is expected linear time for any seeded hash function that acts like a random hash function (universality isn't enough).
	# (Actually, the code as written is O(n log n) when targeting 100% load. It's O(n) when targeting any smaller load factor.)
	# You can make keys_per_bucket higher than the default of 4 but construction time will start to increase dramatically.
	# The paper this is based on compresses the seeds (so the fact that the algorithm tries seeds in increasing order is important)
	# which brings the representation size close to the information-theoretical minimum. I don't do any of that here, but it could
	# be done as a postprocess.
	def make_perfect_hash(keys, load_factor=1.0, keys_per_bucket=4, rhash=murmurhash, max_seed=1000000):
	m = int(len(keys) / load_factor)
	r = int(len(keys) / keys_per_bucket)
	buckets = [[] for i in range(r)]
	for k in keys: buckets[rhash(0, k, r)].append(k)
	occupied = set()
	seeds = [0 for i in range(r)]
	for i, bucket in sorted(enumerate(buckets), reverse=True, key=lambda x: len(x[1])):
	for seed in range(max_seed):
	bucket_occupied = set()
	for k in bucket:
	h = rhash(seed, k, m)
	if h in bucket_occupied or h in occupied: break
	bucket_occupied.add(h)
	else:
	occupied.update(bucket_occupied)
	seeds[i] = seed
	break
	else:
	raise ValueError("Failed to find perfect hash function")
	return lambda k: rhash(seeds[rhash(0, k, r)], k, m)

	words = open("words.txt").read().splitlines() # ~100,000 words
	perfect_hash = make_perfect_hash(words) # ~30 seconds
	assert sorted(perfect_hash(word) for word in words) == list(range(len(words)))