Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@deliro
Created January 18, 2017 11:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deliro/cc00aca6b8b3ef0a9fd18d2ee1ef30d2 to your computer and use it in GitHub Desktop.
Save deliro/cc00aca6b8b3ef0a9fd18d2ee1ef30d2 to your computer and use it in GitHub Desktop.
Bloom filter on pure python
from math import log
from array import array
from struct import unpack
from hashlib import sha512
class Bits:
__slots__ = ('data',)
def __init__(self, cap):
self.data = array('Q', (0 for _ in range((cap // 64) + 1)))
def __setitem__(self, i, x):
byte, bit = divmod(i, 64)
self.data[byte] |= 1 << bit
def __getitem__(self, i):
byte, bit = divmod(i, 64)
return (self.data[byte] & (1 << bit)) >> bit
class Bloom:
__slots__ = ('bit_num', 'probes', 'bits')
def __init__(self, capacity, error=0.001):
self.bit_num = int(-(capacity * log(error)) / 0.4804)
self.probes = int(0.6931 * (self.bit_num / capacity))
self.bits = Bits(self.bit_num)
def _hash(self, obj):
if not isinstance(obj, str):
obj = str(obj)
hashes = unpack('Q'*8, sha512(obj.encode()).digest())
m = 0
l = len(hashes)
for i in range(self.probes):
h = hashes[i % l]
if i and i % l == 0:
m += 1
yield (h >> m) % self.bit_num
def add(self, item):
for h in self._hash(item):
self.bits[h] = 1
def __contains__(self, item):
return all(self.bits[h] for h in self._hash(item))
if __name__ == '__main__':
b = Bloom(10**6)
for i in range(5000):
b.add(i)
for i in range(5000):
assert i in b
errors = sum(i in b for i in range(10**7, 10**7+5000))
print('Error:', errors/5000)
print('Errors:', errors)
print('Bits number:', b.bit_num)
print('Size of bit array:', b.bits.data.__sizeof__())
print('Probes:', b.probes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment