Skip to content

Instantly share code, notes, and snippets.

@mfornet
Last active February 19, 2019 08:58
Show Gist options
  • Save mfornet/32eb5f13b155f9e8c516b408c29e08fb to your computer and use it in GitHub Desktop.
Save mfornet/32eb5f13b155f9e8c516b408c29e08fb to your computer and use it in GitHub Desktop.
What if bloom filters use counters instead of bitmasks?
"""
Exploring bloom filters using counters instead of bit masks.
"""
from random import sample, seed, randint
from hashlib import md5
from functools import partial
# TODO: Use more random & faster hash functions
def hash_md5(A, seed, mod):
l = list(md5(str(A + (seed << 10)).encode()).digest())
num = 0
for x in l:
num = (num * 256 + x) % mod
return num
def get_hashers(num_hashes, filter_size):
"""
Get `num_hashes` functions H_i such that are independents and distribute uniformly in the range [0, k)
"""
hashers = []
for i in range(num_hashes):
hashers.append(partial(hash_md5, seed=i, mod=filter_size))
return hashers
def bloom_filter(cur_set, filter_size, hashers):
bloom_filter = [0] * filter_size
for e in cur_set:
for h in hashers:
bloom_filter[h(e)] += 1
return bloom_filter
def accept(freqs, threshold, isin):
# Found best predicate to accept values from freqs
print(isin, sorted(freqs))
return min(freqs) <= threshold
def compute_diff(cur_set, bloom_filter, hashers, set_size, threshold, target_set):
# Target set is not used to compute diffs
diff = set()
for e in cur_set:
freqs = []
for h in hashers:
freqs.append(bloom_filter[h(e)])
if accept(freqs, threshold, e in target_set):
diff.add(e)
return diff
def setup(set_size, intersection_size, filter_size, num_hashes, threshold):
# seed(0)
hashers = get_hashers(num_hashes, filter_size)
# Simulate intersection
p0 = set(sample(range(10**9), set_size))
p1 = set(sample(p0, intersection_size))
while len(p1) < set_size:
x = randint(0, 10**9 - 1)
if not x in p0:
p1.add(x)
bf0 = bloom_filter(p0, filter_size, hashers)
output = set(compute_diff(p1, bf0, hashers, set_size, threshold, p0))
answer = set([e for e in p1 if not e in p0])
correct_elements = 0
extra_elements = 0
missing_elements = len(answer)
skipped = set_size - len(output)
for e in output:
if e in answer:
correct_elements += 1
missing_elements -= 1
else:
extra_elements += 1
return correct_elements, extra_elements, missing_elements, skipped
if __name__ == '__main__':
correct, extra, missing, skipped = setup(100, 95, 100, 2, 2)
print(f"Correct: {correct} Extra: {extra} Missing: {missing} Skipped: {skipped}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment