Skip to content

Instantly share code, notes, and snippets.

@fabiocerqueira
Last active October 14, 2019 20:30
Show Gist options
  • Save fabiocerqueira/975c2096b91dae4c9770607c862f1c2e to your computer and use it in GitHub Desktop.
Save fabiocerqueira/975c2096b91dae4c9770607c862f1c2e to your computer and use it in GitHub Desktop.
Example BloomFilter implementation in Python
from hashlib import md5
import redis
class BloomFilterBaseBackend:
def initialize(self, size):
self.size = size
def validate(self, position):
if position < 0 or position >= self.size:
raise IndexError(
f"Invalid position {position}, min: 0, max: {self.size - 1}"
)
def set_bits(self, positions):
raise NotImplementedError()
def get_bit(self, positions):
raise NotImplementedError()
class BloomFilterSyncRedisBackend(BloomFilterBaseBackend):
def __init__(self, key, host="localhost", port=6379, db=0):
self.connection = redis.Redis(host, port, db)
self.key = key
def set_bits(self, positions):
pipe = self.connection.pipeline()
for p in positions:
self.validate(p)
pipe.setbit(self.key, p, 1)
pipe.execute()
def get_bits(self, positions):
pipe = self.connection.pipeline()
for p in positions:
self.validate(p)
pipe.getbit(self.key, p)
values = pipe.execute()
out = dict(zip(positions, values))
return out
class BloomFilterMemoryBackend(BloomFilterBaseBackend):
def initialize(self, size):
super().initialize(size)
self.bitarray = 1 << self.size
def set_bits(self, positions):
for p in positions:
self.validate(p)
self.bitarray |= 1 << p
def get_bits(self, positions):
out = {}
for p in positions:
self.validate(p)
value = self.bitarray & (1 << p)
out[p] = 1 if value else 0
return out
def python_hash(key):
return hash(key)
def md5_hash(key):
# md5 is not recommended, but it is just for testing...
return hash(md5(key.encode()).hexdigest())
class BloomFilter:
def __init__(self, size=10000, cache=None, hash_functions=None):
self.size = size
self.count = 0
if cache is None:
cache = BloomFilterMemoryBackend()
self.cache = cache
self.cache.initialize(size)
if hash_functions is None:
self.hash_functions = [python_hash, md5_hash]
def _get_positions(self, key):
return [hash_func(key) % self.size for hash_func in self.hash_functions]
def add(self, key):
if self.count >= self.size:
raise IndexError("BloomFilter is full")
positions = self._get_positions(key)
self.cache.set_bits(positions)
self.count += 1
def check(self, key):
positions = self._get_positions(key)
return all(self.cache.get_bits(positions).values())
def __contains__(self, key):
return self.check(key)
def __len__(self):
return self.count
if __name__ == "__main__":
print("BloomFilter in-memory")
in_memo_bf = BloomFilter()
in_memo_bf.add("babs")
print(f"babs in in_memo_bf = {'babs' in in_memo_bf}")
print(f"fabio in in_memo_bf = {'fabio' in in_memo_bf}")
print("-" * 50)
print("BloomFilter using sync redis with mykey")
bf = BloomFilter(cache=BloomFilterSyncRedisBackend("mykey"))
bf.add("fabio")
print(f"fabio in bf = {'fabio' in bf}")
bf.add("pedro")
print(f"pedro in bf = {'pedro' in bf}")
print(f"xuxu in bf = {'xuxu' in bf}")
print("-" * 50)
print("BloomFilter using sync redis with anotherkey")
bf2 = BloomFilter(cache=BloomFilterSyncRedisBackend("anotherkey"))
print(f"fabio in bf2 = {'fabio' in bf2}")
bf2.add("xuxu")
print(f"xuxu in bf2 = {'xuxu' in bf2}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment