public
Last active

Bloom Filter in python (see https://gist.github.com/4700640)

  • Download Gist
gistfile1.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
from bitarray import bitarray
import mmh3
 
# gist adapted from mburst's work
# https://gist.github.com/4700640
 
# using this dictionary:
# http://manpages.ubuntu.com/manpages/precise/man5/american-english-huge.5.html
 
# comments on reddit
# http://www.reddit.com/r/programming/comments/17slhd/creating_a_simple_bloom_filter/
 
WORDLIST_FILE = "/usr/share/dict/american-english-huge"
 
class BloomFilter:
def __init__(self, size, hash_count):
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def add(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
self.bit_array[result] = 1
def lookup(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
if self.bit_array[result] == 0:
return "Nope"
return "Probably"
 
bf = BloomFilter(500000, 7)
linearWords = []
setWords = set()
 
wordFile = open(WORDLIST_FILE)
for line in wordFile.readlines():
line = line.strip()
bf.add(line)
linearWords.append(line)
setWords.add(line)
 
import datetime
 
def lookup(word):
global linearWords
global setWords
global bf
print "Lookup for %s" % word
start = datetime.datetime.now()
bf.lookup(word)
finish = datetime.datetime.now()
print "Bloom Filter search finished in ", (finish-start).microseconds
 
start = datetime.datetime.now()
for w in linearWords:
if w == word:
break
finish = datetime.datetime.now()
print "Linear search finished in " , (finish-start).microseconds
 
start = datetime.datetime.now()
_ = (word in setWords)
finish = datetime.datetime.now()
print "Set lookup finished in", (finish-start).microseconds
 
print "---------"
 
lookup("google")
lookup("apple")
lookup("diaphanous")

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.