Last active
February 19, 2019 08:58
-
-
Save mfornet/32eb5f13b155f9e8c516b408c29e08fb to your computer and use it in GitHub Desktop.
What if bloom filters use counters instead of bitmasks?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Exploring bloom filters using counters instead of bit masks. | |
""" | |
from random import sample, seed, randint | |
from hashlib import md5 | |
from functools import partial | |
# TODO: Use more random & faster hash functions | |
def hash_md5(A, seed, mod): | |
l = list(md5(str(A + (seed << 10)).encode()).digest()) | |
num = 0 | |
for x in l: | |
num = (num * 256 + x) % mod | |
return num | |
def get_hashers(num_hashes, filter_size): | |
""" | |
Get `num_hashes` functions H_i such that are independents and distribute uniformly in the range [0, k) | |
""" | |
hashers = [] | |
for i in range(num_hashes): | |
hashers.append(partial(hash_md5, seed=i, mod=filter_size)) | |
return hashers | |
def bloom_filter(cur_set, filter_size, hashers): | |
bloom_filter = [0] * filter_size | |
for e in cur_set: | |
for h in hashers: | |
bloom_filter[h(e)] += 1 | |
return bloom_filter | |
def accept(freqs, threshold, isin): | |
# Found best predicate to accept values from freqs | |
print(isin, sorted(freqs)) | |
return min(freqs) <= threshold | |
def compute_diff(cur_set, bloom_filter, hashers, set_size, threshold, target_set): | |
# Target set is not used to compute diffs | |
diff = set() | |
for e in cur_set: | |
freqs = [] | |
for h in hashers: | |
freqs.append(bloom_filter[h(e)]) | |
if accept(freqs, threshold, e in target_set): | |
diff.add(e) | |
return diff | |
def setup(set_size, intersection_size, filter_size, num_hashes, threshold): | |
# seed(0) | |
hashers = get_hashers(num_hashes, filter_size) | |
# Simulate intersection | |
p0 = set(sample(range(10**9), set_size)) | |
p1 = set(sample(p0, intersection_size)) | |
while len(p1) < set_size: | |
x = randint(0, 10**9 - 1) | |
if not x in p0: | |
p1.add(x) | |
bf0 = bloom_filter(p0, filter_size, hashers) | |
output = set(compute_diff(p1, bf0, hashers, set_size, threshold, p0)) | |
answer = set([e for e in p1 if not e in p0]) | |
correct_elements = 0 | |
extra_elements = 0 | |
missing_elements = len(answer) | |
skipped = set_size - len(output) | |
for e in output: | |
if e in answer: | |
correct_elements += 1 | |
missing_elements -= 1 | |
else: | |
extra_elements += 1 | |
return correct_elements, extra_elements, missing_elements, skipped | |
if __name__ == '__main__': | |
correct, extra, missing, skipped = setup(100, 95, 100, 2, 2) | |
print(f"Correct: {correct} Extra: {extra} Missing: {missing} Skipped: {skipped}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment