Skip to content

Instantly share code, notes, and snippets.

@fjsj
Created January 21, 2019 18:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fjsj/f7f544ebcedb1ad931a4d31cdc9d2fb5 to your computer and use it in GitHub Desktop.
Save fjsj/f7f544ebcedb1ad931a4d31cdc9d2fb5 to your computer and use it in GitHub Desktop.
Python bloom filters performance test (based on pybloomfilter's own test)
#! /usr/bin/env python
import os
import tempfile
import time
import timeit
import pybloomfilter
tempfiles = []
ERROR_RATE = 0.01
NUM_K = 7 # https://hur.st/bloomfilter/?n=97070&p=0.01&m=&k=
#def get_and_add_words(Creator, wordlist):
def get_and_add_words(Creator, wordlist):
bf = Creator(len(wordlist), ERROR_RATE)
if isinstance(bf, pybloomfilter.BloomFilter):
add_method = lambda word: bf.add(word.encode('utf-8'))
elif isinstance(bf, ScalableCuckooFilter):
add_method = bf.insert
else:
add_method = bf.add
for word in wordlist:
add_method(word)
return bf
def check_words(bf, wordlist):
if isinstance(bf, pybloomfilter.BloomFilter):
contains_method = lambda word: word.encode('utf-8') in bf
elif isinstance(bf, ScalableCuckooFilter):
contains_method = bf.contains
# elif isinstance(bf, pyreBloom.pyreBloom):
# contains_method = bf.contains
else:
contains_method = bf.__contains__
for word in wordlist:
contains_method(word)
def test_errors(bf, correct_wordlist, test_wordlist):
errors = [0, 0]
if isinstance(bf, pybloomfilter.BloomFilter):
contains_method = lambda word: word.encode('utf-8') in bf
elif isinstance(bf, ScalableCuckooFilter):
contains_method = bf.contains
else:
contains_method = bf.__contains__
for word in test_wordlist:
word = word
if contains_method(word):
if word not in correct_wordlist:
errors[0] += 1
else:
if word in correct_wordlist:
errors[1] += 1
print('%0.2f%% positive %0.2f%% negative' % (
errors[0] / float(len(correct_wordlist)) * 100,
errors[1] / float(len(correct_wordlist)) * 100))
def create_word_list(filename):
f = open(filename, 'r')
words_set = set()
for line in f:
line = line.strip().lower()
if line:
words_set.add(line)
f.close()
return words_set
def create_cbloomfilter(*args):
args = list(args)
f = tempfile.NamedTemporaryFile()
tempfiles.append(f)
os.unlink(f.name)
args.append(f.name.encode('utf-8'))
return pybloomfilter.BloomFilter(*tuple(args))
creators = [create_cbloomfilter]
import pybloom_live
creators.append(pybloom_live.BloomFilter)
from cuckoo.filter import ScalableCuckooFilter
creators.append(ScalableCuckooFilter)
import pybloof
creators.append(lambda words_len, error_rate: pybloof.StringBloomFilter(words_len, NUM_K))
# import pyreBloom
# creators.append(lambda words_len, error_rate: pyreBloom.pyreBloom('myBloomFilter'.encode('utf-8'), words_len, error_rate))
import mmh3
from probables.hashes import hash_with_depth_bytes
@hash_with_depth_bytes
def my_hash(key):
return mmh3.hash_bytes(key)
import probables
creators.append(lambda words_len, error_rate: probables.BloomFilter(words_len, error_rate, hash_function=my_hash))
creators = creators[::-1]
def run_test():
dict_wordlist = create_word_list('words')
test_wordlist = create_word_list('testwords')
NUM = 10
for creator in creators:
start = time.time()
if NUM:
t = timeit.Timer(lambda : get_and_add_words(creator, dict_wordlist))
print("%s took %0.5f s/run" % (
creator,
t.timeit(NUM) / float(NUM)))
bf = get_and_add_words(creator, dict_wordlist)
if NUM:
t = timeit.Timer(lambda : check_words(bf, test_wordlist))
print("%s took %0.5f s/run" % (
creator,
t.timeit(NUM) / float(NUM)))
test_errors(bf, dict_wordlist, test_wordlist)
print('--------------------')
if __name__ == "__main__":
run_test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment