Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A Bloom filter based on Max Burstein's implementation (http://maxburstein.com/blog/creating-a-simple-bloom-filter/)
from bitarray import bitarray
import mmh3
from math import log, pow, ceil
class BloomFilter:
def __init__(self, size, hash_count):
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def add(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
self.bit_array[result] = 1
def lookup(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
if self.bit_array[result] == 0:
return False
return True
@staticmethod
def suggest_sizes(n, p):
"""Given an expected number of items and probability of false positives, suggests an appropriate size and hash count for a Bloom filter.
n - expected number of items in filter
p - probability of false positives (0.0 - 1.0)
http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives"""
if not (0.0 <= p <= 1.0):
raise ValueError("False probability percentage must be between 0.0 and 1.0")
if n <= 0:
raise ValueError("Number of items must be greater than 0")
l2 = log(2)
m = -n*log(p)/pow(l2, 2)
k = (m/n) * l2
return (int(ceil(m)), int(ceil(k)))
@staticmethod
def create_suggested(n, p):
"""Given an expected number of items and probability of false positives, returns a BloomFilter of appropriate size and hash count.
n - expected number of items in filter
p - probability of false positives (0.0 - 1.0)"""
suggested_sizes = BloomFilter.suggest_sizes(n, p)
return BloomFilter(suggested_sizes[0], suggested_sizes[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment