quiver/counting_bloom_filter.py

## counting_bloom_filter.py
# http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
# http://www.michaelnielsen.org/ddi/why-bloom-filters-work-the-way-they-do/
import hashlib

class BloomFilter(object):

    def __init__(self, m, k):
        self.m = m
        self.k = k
        self.array = [0 for i in range(m)]
        #import scipy # if scipy installed
        #self.array = scipy.zeros(m, dtype=int)

    def __str__(self):
        return self.array.__str__()

    def add(self, key):
        for i in self._hashes(key):
            self.array[i] += 1

    def remove(self, key):
        if self.contains(key):
            for i in self._hashes(key):
                self.array[i] -= 1

    def contains(self, key):
        return all(self.array[i] for i in self._hashes(key))

    def _hashes(self, key):
        """
        generate k different hashes, each of which maps a key to one of
        the m array positions with a uniform random distribution
        """
        h = hashlib.new('md5')
        h.update(str(key))
        x = 0
        for _ in xrange(self.k):
            if x < self.m:
                h.update('.')
                x = long(h.hexdigest(), 16)
            x, y = divmod(x, self.m)
            yield y

def test_bloom(m, k, n):
    b = BloomFilter(m, k)
    for i in xrange(n):
        b.add(i)
        assert b.contains(i)

    print b

    b.remove(3)
    print b

if __name__ == '__main__':
    test_bloom(100, 3, 10)
	# http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
	# http://www.michaelnielsen.org/ddi/why-bloom-filters-work-the-way-they-do/
	import hashlib

	class BloomFilter(object):

	def __init__(self, m, k):
	self.m = m
	self.k = k
	self.array = [0 for i in range(m)]
	#import scipy # if scipy installed
	#self.array = scipy.zeros(m, dtype=int)

	def __str__(self):
	return self.array.__str__()

	def add(self, key):
	for i in self._hashes(key):
	self.array[i] += 1

	def remove(self, key):
	if self.contains(key):
	for i in self._hashes(key):
	self.array[i] -= 1

	def contains(self, key):
	return all(self.array[i] for i in self._hashes(key))

	def _hashes(self, key):
	"""
	generate k different hashes, each of which maps a key to one of
	the m array positions with a uniform random distribution
	"""
	h = hashlib.new('md5')
	h.update(str(key))
	x = 0
	for _ in xrange(self.k):
	if x < self.m:
	h.update('.')
	x = long(h.hexdigest(), 16)
	x, y = divmod(x, self.m)
	yield y

	def test_bloom(m, k, n):
	b = BloomFilter(m, k)
	for i in xrange(n):
	b.add(i)
	assert b.contains(i)

	print b

	b.remove(3)
	print b

	if __name__ == '__main__':
	test_bloom(100, 3, 10)