Created
June 8, 2012 19:04
-
-
Save josephkern/2897618 to your computer and use it in GitHub Desktop.
A Simple Bloom Filter in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""A simple Bloom Filter implementation | |
Calculating optimal filter size: | |
Where: | |
m is: self.bitcount (how many bits in self.filter) | |
n is: the number of expected values added to self.filter | |
k is: the number of hashes being produced | |
(1 - math.exp(-float(k * n) / m)) ** k | |
http://en.wikipedia.org/wiki/Bloom_filter | |
""" | |
from hashlib import sha256 | |
class BloomFilter(object): | |
"""A simple bloom filter for lots of int()""" | |
def __init__(self, array_size=(1 * 1024), hashes=13): | |
"""Initializes a BloomFilter() object: | |
Expects: | |
array_size (in bytes): 4 * 1024 for a 4KB filter | |
hashes (int): for the number of hashes to perform""" | |
self.filter = bytearray(array_size) # The filter itself | |
self.bitcount = array_size * 8 # Bits in the filter | |
self.hashes = hashes # The number of hashes to use | |
def _hash(self, value): | |
"""Creates a hash of an int and yields a generator of hash functions | |
Expects: | |
value: int() | |
Yields: | |
generator of ints()""" | |
# Build an int() around the sha256 digest of int() -> value | |
value = value.__str__() # Comment out line if you're filtering strings() | |
digest = int(sha256(value).hexdigest(), 16) | |
for _ in range(self.hashes): | |
# bitwise AND of the digest and all of the available bit positions | |
# in the filter | |
yield digest & (self.bitcount - 1) | |
# Shift bits in digest to the right, based on 256 (in sha256) | |
# divided by the number of hashes needed be produced. | |
# Rounding the result by using int(). | |
# So: digest >>= (256 / 13) would shift 19 bits to the right. | |
digest >>= (256 / self.hashes) | |
def add(self, value): | |
"""Bitwise OR to add value(s) into the self.filter | |
Expects: | |
value: generator of digest ints() | |
""" | |
for digest in self._hash(value): | |
# In-place bitwise OR of the filter, position is determined | |
# by the (digest / 8) digest is described above in self._hash() | |
# Bitwise OR is undertaken on the value at the location and | |
# 2 to the power of digest modulo 8. Ex: 2 ** (30034 % 8) | |
# to grantee the value is <= 128, the bytearray not being able | |
# to store a value >= 256. Q: Why not use ((modulo 9) -1) then? | |
self.filter[(digest / 8)] |= (2 ** (digest % 8)) | |
# The purpose here is to spread out the hashes to create a unique | |
# "fingerprint" with unique locations in the filter array, | |
# rather than just a big long hash blob. | |
def query(self, value): | |
"""Bitwise AND to query values in self.filter | |
Expects: | |
value: value to check filter against (assumed int())""" | |
# If all() hashes return True from a bitwise AND (the opposite | |
# described above in self.add()) for each digest returned from | |
# self._hash return True, else False | |
return all(self.filter[(digest / 8)] & (2 ** (digest % 8)) | |
for digest in self._hash(value)) | |
if __name__ == "__main__": | |
bf = BloomFilter() | |
bf.add(30000) | |
bf.add(1230213) | |
bf.add(1) | |
print("Filter size {0} bytes").format(bf.filter.__sizeof__()) | |
print bf.query(1) # True | |
print bf.query(1230213) # True | |
print bf.query(12) # False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
Look at this: https://github.com/daedalus/bloomfilter/blob/master/bloom.py