Skip to content

@swirepe /gist:4703275
Last active

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Bloom Filter in python (see https://gist.github.com/4700640)
from bitarray import bitarray
import mmh3
# gist adapted from mburst's work
# https://gist.github.com/4700640
# using this dictionary:
# http://manpages.ubuntu.com/manpages/precise/man5/american-english-huge.5.html
# comments on reddit
# http://www.reddit.com/r/programming/comments/17slhd/creating_a_simple_bloom_filter/
WORDLIST_FILE = "/usr/share/dict/american-english-huge"
class BloomFilter:
def __init__(self, size, hash_count):
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def add(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
self.bit_array[result] = 1
def lookup(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
if self.bit_array[result] == 0:
return "Nope"
return "Probably"
bf = BloomFilter(500000, 7)
linearWords = []
setWords = set()
wordFile = open(WORDLIST_FILE)
for line in wordFile.readlines():
line = line.strip()
bf.add(line)
linearWords.append(line)
setWords.add(line)
import datetime
def lookup(word):
global linearWords
global setWords
global bf
print "Lookup for %s" % word
start = datetime.datetime.now()
bf.lookup(word)
finish = datetime.datetime.now()
print "Bloom Filter search finished in ", (finish-start).microseconds
start = datetime.datetime.now()
for w in linearWords:
if w == word:
break
finish = datetime.datetime.now()
print "Linear search finished in " , (finish-start).microseconds
start = datetime.datetime.now()
_ = (word in setWords)
finish = datetime.datetime.now()
print "Set lookup finished in", (finish-start).microseconds
print "---------"
lookup("google")
lookup("apple")
lookup("diaphanous")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.