Skip to content

Instantly share code, notes, and snippets.

@techbelly
Created January 7, 2011 02:26
Show Gist options
  • Save techbelly/769015 to your computer and use it in GitHub Desktop.
Save techbelly/769015 to your computer and use it in GitHub Desktop.
import hashlib
from BitVector import BitVector
class BloomFilter:
def __init__(self,bits,hashes,digest=hashlib.md5):
self.bits = bits
self.hashes = hashes
self.digest = digest
self.check_digest(bits,hashes,digest)
self.bloom = BitVector(intVal = 0, size = (2**bits))
# self.bloom = [False] * (2**bits)
def check_digest(self,bits,hashes,digest):
if digest().digest_size*8 < bits * hashes:
raise ValueError( \
'Digest size of %d bytes, for %d hashes of %d bits not enough' % \
(digest().digest_size, hashes, bits))
def hashes_for(self,string):
def chunks(l, n):
for i in xrange(0, len(l), n):
yield l[i:i+n]
def first(n,iter):
return (x for i,x in enumerate(iter) if i < n)
m = self.digest()
m.update(string.lower())
as_int = int(m.hexdigest(),16)
as_bin = bin(as_int).lstrip('0b')
return [int(chunk,2) for chunk in first(self.hashes,chunks(as_bin,self.bits))]
def insert(self,string):
for i in self.hashes_for(string):
self.bloom[i] = 1
def contains(self,string):
return all(self.bloom[i] for i in self.hashes_for(string))
bits = 20 #
hashes = 4
digest = hashlib.md5
bloom = BloomFilter(bits, hashes, digest)
print "LOADING DICTIONARY"
for l in open('/usr/share/dict/words'):
bloom.insert(l.strip())
def misspelt_words_in(text):
return [word for word in text.split() if not bloom.contains(word)]
text = """The bits of a bit array are stored in 16-bit unsigned ints. After
resolving the argument with which the constructor is called (which
happens in lines (A2) through (A70) of the file BitVector.py), the
very first thing that the constructor does is to figure out in line
(A78) as to how many of those 2-byte ints it needs for the bits.
For example, if you wanted to store a 64-bit array, the variable
'two_byte_ints_needed' in line (A78) would be set to 4. (This does
not mean that the size of a bit vector must be a multiple of 16.
Any sized bit vectors can constructed using the required number of
two-byte ints.) Line (A79) then creates an array of 2-byte ints and
initializes it with the required number of zeros. Lines (A80) then
shifts the bits into the array of two-byte ints.
This is a taest of sume mispelling and zarf"""
# More a test of words in /usr/share/dict/words than a spelling test, unfortunately.
print misspelt_words_in(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment