Skip to content

Instantly share code, notes, and snippets.

@imvladikon
Created October 9, 2022 12:34
Show Gist options
  • Save imvladikon/20c7ca20b5c73cc39d309c05548a616c to your computer and use it in GitHub Desktop.
Save imvladikon/20c7ca20b5c73cc39d309c05548a616c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib
import numpy as np
from typing import List, Set
def iter_ngrams(s, ngram=2):
for i in range(len(s) - ngram + 1):
yield s[i: i + ngram]
def flip_bloom_filter(string: str, bf_len: int, num_hash_funct: int):
"""
Hash string and return indices of bits that have been flipped correspondingly.
:param string: string: to be hashed and to flip bloom filter
:param bf_len: int: length of bloom filter
:param num_hash_funct: int: number of hash functions
:return: bfset: a set of integers - indices that have been flipped to 1
"""
# config for hashing
h1 = hashlib.sha1
h2 = hashlib.md5
sha_bytes = h1(string.encode('utf-8')).digest()
md5_bytes = h2(string.encode('utf-8')).digest()
int1 = int.from_bytes(sha_bytes, 'big') % bf_len
int2 = int.from_bytes(md5_bytes, 'big') % bf_len
# flip {num_hash_funct} times
bfset = set()
for i in range(num_hash_funct):
gi = (int1 + i * int2) % bf_len
bfset.add(gi)
return bfset
def record_to_bf(ngrams, bf_len, num_hash_function):
candidate_bloom_filter = set()
for signature in ngrams:
bfset = flip_bloom_filter(signature, bf_len, num_hash_function)
# union indices that have been flipped 1 in candidate bf
candidate_bloom_filter = candidate_bloom_filter.union(bfset)
# massage the cbf into a numpy bool array from a set
bloom_filter_vector = np.zeros(bf_len, dtype=bool)
bloom_filter_vector[list(candidate_bloom_filter)] = True
return bloom_filter_vector
print(record_to_bf(iter_ngrams("john doe"), 500, 16))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment