hiway/ubloom.py

## ubloom.py
# coding=utf-8
try:
    import urandom as _random
    import uhashlib as _hashlib
    import ubinascii as _binascii

except ImportError:
    import random as _random
    import hashlib as _hashlib
    import binascii as _binascii


class BloomFilter:
    """
    A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint.

    Reference:
    - http://code.activestate.com/recipes/577684-bloom-filter/
    - http://maciejczyzewski.me/2014/10/18/bloom-filters-fast-and-simple.html
    """

    def __init__(self, size: int = 128, probes: int = 3, iterable: tuple = ()):
        self.array = bytearray(size)
        self._bit_count = size * 8  # each byte in the bytearray = 8 bits
        self._probes = probes
        self.update(iterable)

    def __contains__(self, key):
        return all(self.array[i // 8] & (2 ** (i % 8)) for i in self._get_probes(key))

    def _get_probes(self, key: str):
        digest = int(_binascii.hexlify(_hashlib.sha256(key.encode()).digest()), 16)
        for _ in range(self._probes):
            # yield h & ((2 ** 13) - 1)
            # yield h & 8191
            yield digest & self._bit_count - 1
            digest >>= 256 // self._probes

    def add(self, key: str):
        for i in self._get_probes(key):
            self.array[i // 8] |= 2 ** (i % 8)

    def update(self, keys):
        for key in keys:
            self.add(key)


if __name__ == '__main__':
    positives = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
            Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
            Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
            Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
            NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
            Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
            Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
    negatives = """lkjdhf lkdjh djk gjhd fhv jdhbfjdbflkj ad flakjdf kldajf kdjfn dkjn
            fdkj fkdjb fdjbf jabd jkh hja dvfjhabd jsbdjhas bdjhab sdjbasdjahbs
            dhjasv dhv dks78t3 4uy3g beg f8yg u3hb433fi 7ku1bi76f3r vgqdhbbfi76erf
            kjh 37wo2jo3kdkg i3yig iu 3geakjge3h kjdh uy eiu hejnkhfd hdofjdkjfld""".split()

    bf = BloomFilter(iterable=positives)

    m = sum(p in bf for p in positives)
    print('%d true positives and %d false negatives out of %d positive trials'
          % (m, len(positives) - m, len(positives)))

    m = sum(b not in bf for b in negatives)
    print('%d true negatives and %d false positives out of %d negative trials'
          % (m, len(negatives) - m, len(negatives)))

    try:
        from random import sample
        from string import ascii_letters

        trials = 100000
        m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
        print('%d true negatives and %d false positives out of %d negative trials'
              % (trials - m, m, trials))
        print(100 - (((trials - m) / trials) * 100))
    except ImportError:  # fails on micropython
        pass

    c = ''.join('08b'.format(x) for x in bf.array)
    print('Bit density:', c.count('1') / float(len(c)))
	# coding=utf-8
	try:
	import urandom as _random
	import uhashlib as _hashlib
	import ubinascii as _binascii

	except ImportError:
	import random as _random
	import hashlib as _hashlib
	import binascii as _binascii


	class BloomFilter:
	"""
	A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint.

	Reference:
	- http://code.activestate.com/recipes/577684-bloom-filter/
	- http://maciejczyzewski.me/2014/10/18/bloom-filters-fast-and-simple.html
	"""

	def __init__(self, size: int = 128, probes: int = 3, iterable: tuple = ()):
	self.array = bytearray(size)
	self._bit_count = size * 8 # each byte in the bytearray = 8 bits
	self._probes = probes
	self.update(iterable)

	def __contains__(self, key):
	return all(self.array[i // 8] & (2 ** (i % 8)) for i in self._get_probes(key))

	def _get_probes(self, key: str):
	digest = int(_binascii.hexlify(_hashlib.sha256(key.encode()).digest()), 16)
	for _ in range(self._probes):
	# yield h & ((2 ** 13) - 1)
	# yield h & 8191
	yield digest & self._bit_count - 1
	digest >>= 256 // self._probes

	def add(self, key: str):
	for i in self._get_probes(key):
	self.array[i // 8] \|= 2 ** (i % 8)

	def update(self, keys):
	for key in keys:
	self.add(key)


	if __name__ == '__main__':
	positives = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
	Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
	Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
	Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
	NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
	Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
	Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
	negatives = """lkjdhf lkdjh djk gjhd fhv jdhbfjdbflkj ad flakjdf kldajf kdjfn dkjn
	fdkj fkdjb fdjbf jabd jkh hja dvfjhabd jsbdjhas bdjhab sdjbasdjahbs
	dhjasv dhv dks78t3 4uy3g beg f8yg u3hb433fi 7ku1bi76f3r vgqdhbbfi76erf
	kjh 37wo2jo3kdkg i3yig iu 3geakjge3h kjdh uy eiu hejnkhfd hdofjdkjfld""".split()

	bf = BloomFilter(iterable=positives)

	m = sum(p in bf for p in positives)
	print('%d true positives and %d false negatives out of %d positive trials'
	% (m, len(positives) - m, len(positives)))

	m = sum(b not in bf for b in negatives)
	print('%d true negatives and %d false positives out of %d negative trials'
	% (m, len(negatives) - m, len(negatives)))

	try:
	from random import sample
	from string import ascii_letters

	trials = 100000
	m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
	print('%d true negatives and %d false positives out of %d negative trials'
	% (trials - m, m, trials))
	print(100 - (((trials - m) / trials) * 100))
	except ImportError: # fails on micropython
	pass

	c = ''.join('08b'.format(x) for x in bf.array)
	print('Bit density:', c.count('1') / float(len(c)))