Created
June 10, 2020 11:48
-
-
Save azizkayumov/74d5e79931187338eed078e9bb8dd8c2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This gist aims to test BloomFilter with k-slices | |
# Assume we have a giant set of English words, | |
# user enters any string to know if the string is an English word | |
# It is important not to use HashMap or HashSet because of memory contraints | |
# | |
# Steps to follow before using: | |
# 1. Download English words from this repo: https://github.com/dwyl/english-words/blob/master/words_alpha.txt | |
# 2. Open command line and install murmur hash: | |
# pip3 install mmh3 | |
# 3. Run sbf.py from command line: | |
# python3 sbf.py | |
from math import ceil, log2, log | |
from bitstring import BitArray | |
import mmh3 # murmur hash | |
import os # get pid | |
class BloomFilter: | |
def __init__(self, N, P): | |
p = 1 / 2 | |
self.m = ceil(- N / log(p)) | |
self.k = ceil(log2(1 / P)) | |
self.slices = [] | |
for i in range(self.k): | |
self.slices.append(BitArray(self.m)) | |
def add(self, value): | |
for slice in range(self.k): | |
index = self.hash(slice, value) | |
self.slices[slice][index] = 1 | |
def contains(self, value): | |
for slice in range(self.k): | |
index = self.hash(slice, value) | |
if self.slices[slice][index] == 0: | |
return False | |
return True | |
def hash(self, seed, value): | |
value = value.lower() | |
h = mmh3.hash(value, seed) | |
return h % self.m | |
file = open("words.txt",'r') | |
words = file.read().splitlines() # to ignore '\n's | |
N = len(words) | |
P = 0.001 | |
bf = BloomFilter(N, P) | |
print("BloomFilter initialized with: ") | |
print("m = ", bf.m) | |
print("k = ", bf.k) | |
print("N = ", N) | |
for word in words: | |
bf.add(word) | |
print("PID = ", os.getpid()) | |
s = input("Enter any word: ") | |
while s: | |
if bf.contains(s): | |
print(s + " may be an English word.\n") | |
else: | |
print(s + " is not an English word.\n") | |
s = input("Enter any word: ") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment