Skip to content

Instantly share code, notes, and snippets.

@hiway
Created August 3, 2017 20:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hiway/2b7f5c795e9fd28ce33ffa5ae90fbfa8 to your computer and use it in GitHub Desktop.
Save hiway/2b7f5c795e9fd28ce33ffa5ae90fbfa8 to your computer and use it in GitHub Desktop.
A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint.
# coding=utf-8
try:
import urandom as _random
import uhashlib as _hashlib
import ubinascii as _binascii
except ImportError:
import random as _random
import hashlib as _hashlib
import binascii as _binascii
class BloomFilter:
"""
A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint.
Reference:
- http://code.activestate.com/recipes/577684-bloom-filter/
- http://maciejczyzewski.me/2014/10/18/bloom-filters-fast-and-simple.html
"""
def __init__(self, size: int = 128, probes: int = 3, iterable: tuple = ()):
self.array = bytearray(size)
self._bit_count = size * 8 # each byte in the bytearray = 8 bits
self._probes = probes
self.update(iterable)
def __contains__(self, key):
return all(self.array[i // 8] & (2 ** (i % 8)) for i in self._get_probes(key))
def _get_probes(self, key: str):
digest = int(_binascii.hexlify(_hashlib.sha256(key.encode()).digest()), 16)
for _ in range(self._probes):
# yield h & ((2 ** 13) - 1)
# yield h & 8191
yield digest & self._bit_count - 1
digest >>= 256 // self._probes
def add(self, key: str):
for i in self._get_probes(key):
self.array[i // 8] |= 2 ** (i % 8)
def update(self, keys):
for key in keys:
self.add(key)
if __name__ == '__main__':
positives = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
negatives = """lkjdhf lkdjh djk gjhd fhv jdhbfjdbflkj ad flakjdf kldajf kdjfn dkjn
fdkj fkdjb fdjbf jabd jkh hja dvfjhabd jsbdjhas bdjhab sdjbasdjahbs
dhjasv dhv dks78t3 4uy3g beg f8yg u3hb433fi 7ku1bi76f3r vgqdhbbfi76erf
kjh 37wo2jo3kdkg i3yig iu 3geakjge3h kjdh uy eiu hejnkhfd hdofjdkjfld""".split()
bf = BloomFilter(iterable=positives)
m = sum(p in bf for p in positives)
print('%d true positives and %d false negatives out of %d positive trials'
% (m, len(positives) - m, len(positives)))
m = sum(b not in bf for b in negatives)
print('%d true negatives and %d false positives out of %d negative trials'
% (m, len(negatives) - m, len(negatives)))
try:
from random import sample
from string import ascii_letters
trials = 100000
m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
print('%d true negatives and %d false positives out of %d negative trials'
% (trials - m, m, trials))
print(100 - (((trials - m) / trials) * 100))
except ImportError: # fails on micropython
pass
c = ''.join('08b'.format(x) for x in bf.array)
print('Bit density:', c.count('1') / float(len(c)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment