-
-
Save bartdegoede/42ef7a265d946a9a75617a89ecbaf674 to your computer and use it in GitHub Desktop.
Basic Bloom filter implementation in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mmh3 | |
class Bloomfilter(object): | |
def __init__(self, m=15, k=3): | |
self.m = m | |
self.k = k | |
# we use a list of Booleans to represent our bit array for simplicity | |
self.bit_array = [False for i in range(self.m)] | |
def __repr__(self): | |
return '<Bloomfilter size: {}>'.format(self.m) | |
def add(self, element): | |
""" | |
Add an element to the filter. Murmurhash3 gives us hash values | |
distributed uniformly enough we can use different seeds to | |
represent different hash functions | |
""" | |
for i in range(self.k): | |
# this will give us a number between 0 and m - 1 | |
digest = mmh3.hash(element, i, signed=False) % self.m | |
self.bit_array[digest] = True | |
def check(self, element): | |
""" | |
To check whether element is in the filter, we hash the element with | |
the same hash functions as the add functions (using the seed). If one | |
of them doesn't occur in our bit_array, the element is not in there | |
(only a value that hashes to all of the same indices we've already | |
seen before). | |
""" | |
for i in range(self.k): | |
digest = mmh3.hash(element, i, signed=False) % self.m | |
if self.bit_array[digest] == False: | |
# if any of the bits hasn't been set, then it's not in | |
# the filter | |
return False | |
return True | |
if __name__ == '__main__': | |
bf = Bloomfilter() | |
bf.add('hello') | |
bf.add('world') | |
assert(bf.check('hello') == True) | |
assert(bf.check('hello world') == False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment