Skip to content

Instantly share code, notes, and snippets.

@shuntksh
Last active November 4, 2023 04:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuntksh/93aa4a7f0080d4b5a273b9bce1b97e38 to your computer and use it in GitHub Desktop.
Save shuntksh/93aa4a7f0080d4b5a273b9bce1b97e38 to your computer and use it in GitHub Desktop.
Bloom Filter in Typescript
/**
* BloomFilter is a probabilistic data structure used to test whether an element
* is a member of a set, allowing for false positives.
*/
export class BloomFilter {
static readonly DEFAULT_BIT_LEN = 8;
#size: number;
#bitArray: Uint8Array; // Uint8Array might be slower than BigInt64Array but easier to debug
#hashPassCount: number;
#count = 0;
constructor(expectedItems: number, falsePositiveRate: number = 0.01) {
// Calculate the size of the bit array (m)
// then find the next power of two for the size to avoid modulo bias.
const _size = Math.ceil((-expectedItems * Math.log(falsePositiveRate)) / Math.pow(Math.log(2), 2));
this.#size = Math.pow(2, Math.ceil(Math.log(_size) / Math.log(2)));
this.#hashPassCount = this.#getHashPassCount(this.#size);
this.#bitArray = new Uint8Array(Math.ceil(this.#size / BloomFilter.DEFAULT_BIT_LEN));
console.log(this.#size, this.#hashPassCount);
}
get count() {
return this.#count;
}
getBitArray() {
return new Uint8Array(this.#bitArray);
}
/**
* Add a value to the BloomFilter.
*/
add(value: string) {
if (this.has(value)) return;
if (this.#isCloseToLimit()) this.resize();
for (let i = 0; i < this.#hashPassCount; i++) {
const hash = this.#combinedHash(value, i);
this.#setBit(hash);
}
this.#count++;
}
/**
* Check whether a value is possibly in the BloomFilter.
*/
has(value: string): boolean {
for (let i = 0; i < this.#hashPassCount; i++) {
const hash = this.#combinedHash(value, i);
if (!this.#getBit(hash)) return false;
}
return true;
}
/**
* Resizes the bit array and increases the number of hash functions.
* This is done by doubling the size of the bit array and recalculating the optimal hash count.
*/
resize() {
const newSize = this.#size * 2;
const newBitArray = new Uint8Array(newSize);
newBitArray.set(this.#bitArray);
this.#size = newSize;
this.#bitArray = newBitArray;
this.#hashPassCount = this.#getHashPassCount();
}
/**
* Gets the bit value at a specific index in the Bloom filter.
*/
#getBit(index: number): boolean {
const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
return (this.#bitArray[byteIndex] & (1 << bitIndex)) !== 0;
}
/**
* Sets a bit to 1 at a specific index in the Bloom filter.
*/
#setBit(index: number) {
const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
this.#bitArray[byteIndex] |= 1 << bitIndex;
}
#combinedHash(value: string, salt: number): number {
const combinedValue = value + salt.toString();
const hash1 = this.#hashDJB2(combinedValue);
const hash2 = this.#hashSDBM(combinedValue);
const hash3 = this.#hashMurmur3(combinedValue);
const hash4 = this.#hashFNV1a(combinedValue);
return ((hash1 ^ hash2 ^ hash3 ^ hash4) >>> 0) % this.#size;
}
/**
* DJB2 hashing function.
* A simple and effective string hashing algorithm. It’s used as one of the
* hashing functions in the Bloom filter, assisting in minimizing the
* likelihood of collision.
*
* See: http://www.cse.yorku.ca/~oz/hash.html
*
*/
#hashDJB2(value: string): number {
// Magic constant helps in reducing the chance of collision
let hash = 5381;
// Bitwise XOR and multiplication by prime number (33) to scramble the bits.
for (let i = 0; i < value.length; i++) hash = (hash * 33) ^ value.charCodeAt(i);
// Ensure a positive number within the range of the bit array size.
return (hash >>> 0) % this.#size;
}
/**
* SDBM hashing function.
* A hash function suitable for hash table lookup. It’s used within the Bloom filter
* to create hash values, making it able to ascertain element membership.
*
* see: http://www.cse.yorku.ca/~oz/hash.html
*
*/
#hashSDBM(value: string): number {
let hash = 0;
// Combination of bitwise operations and arithmetic to scramble the bits.
for (let i = 0; i < value.length; i++) hash = value.charCodeAt(i) + (hash << 6) + (hash << 16) - hash;
// Ensure a positive number within the range of the bit array size.
return (hash >>> 0) % this.#size;
}
/**
* MurmurHash3 hashing function.
* This is a non-cryptographic, fast hash function. It takes in a string value
* and produces a 32-bit hash. The function is utilized to generate hash values
* to be used in the Bloom filter, allowing it to decide the presence of an element.
*
* see: https://en.wikipedia.org/wiki/MurmurHash
*
*/
#hashMurmur3(value: string): number {
// Constants are defined for use in the bitwise operations, part of MurmurHash3 specification
const c1 = 0xcc9e2d51;
const c2 = 0x1b873593;
let hash = 0;
const chunks = Math.ceil(value.length / 4);
for (let c = 0; c < chunks; c++) {
let k = 0;
for (let i = 0; i < 4; i++) {
// Left shift operation for ordering bytes, and OR operation to combine them.
if (c * 4 + i < value.length) k |= value.charCodeAt(c * 4 + i) << (i * 8);
}
// Multiply k by a constant, followed by bitwise rotation and multiplication by another constant.
k = k * c1;
k = (k << 15) | (k >>> 17); // Bitwise rotation (left by 15, right by 17)
k = k * c2;
// XOR operation with hash, followed by bitwise rotations and multiplication.
hash ^= k;
hash = (hash << 13) | (hash >>> 19); // Bitwise rotation (left by 13, right by 19)
hash = hash * 5 + 0xe6546b64;
}
// XOR operation with length of string.
hash ^= value.length;
// Series of bitwise operations to finalize the hash value.
hash ^= hash >>> 16;
hash *= 0x85ebca6b; // Multiplication with constant
hash ^= hash >>> 13; // XOR after bitwise rotation
hash *= 0xc2b2ae35; // Multiplication with constant
hash ^= hash >>> 16; // XOR after bitwise rotation
// Ensuring the hash value is positive and fits within the size of the bit array.
return (hash >>> 0) % this.#size;
}
/**
* FNV-1a hash function.
* Created by Glenn Fowler, Landon Curt Noll, and Kiem-Phong Vo in 1991.
*
* see: http://www.isthe.com/chongo/tech/comp/fnv/index.html
*
*/
#hashFNV1a(value: string): number {
const FNV_prime = 0x01000193; // FNV prime value for 32-bit hash
let hash = 0x811c9dc5; // FNV offset basis for 32-bit hash
for (let i = 0; i < value.length; i++) {
const byte_of_data = value.charCodeAt(i);
hash ^= byte_of_data; // XOR byte of data into the bottom of the hash.
hash *= FNV_prime; // Multiply by FNV prime to disperse the hash.
}
return (hash >>> 0) % this.#size; // ensure the hash is unsigned
}
#isCloseToLimit() {
return this.#count / this.#size >= 0.5; // Threshold can be adjusted
}
/**
* Calculates the optimal number of hash functions based on the current size of the bit array
* and the desired false positive rate.
*/
#getHashPassCount(size: number = this.#size) {
// The formula for the optimal number of hash functions is k = (m/n) * ln(2)
// However, we need an estimate for n. We'll use the current size as an approximation for n.
// This is a simplification and not technically accurate.
const n = size * 0.25; // This is an approximation
const k = Math.round((size / n) * Math.log(2)); // Calculate the optimal number of hash functions
return Math.max(1, k); // Ensure we have at least one hash function
}
}
import { fc, test as testFC } from "@fast-check/vitest";
import { webcrypto } from "node:crypto";
import { describe, expect, test } from "vitest";
import { BloomFilter } from "./bloom-filter";
describe("BloomFilter", () => {
test("should add and check values correctly", () => {
const bf = new BloomFilter(1000);
bf.add("test");
expect(bf.has("test")).toBe(true);
expect(bf.has("unknown")).toBe(false);
});
test("should handle different string lengths", () => {
const bf = new BloomFilter(1000);
bf.add("a");
bf.add("abcdefghijklmnopqrstuvwxyz");
expect(bf.has("a")).toBe(true);
expect(bf.has("abcdefghijklmnopqrstuvwxyz")).toBe(true);
});
test("false positive rate should be reasonable", () => {
const targetFalsePositiveRate = 0.01;
const bf = new BloomFilter(1500, targetFalsePositiveRate);
const testData: string[] = [];
const falseData: string[] = [];
// Generating test data
for (let i = 0; i < 1000; i++) {
testData.push(webcrypto.randomUUID());
falseData.push(webcrypto.randomUUID());
}
// Adding test data to bloom filter
testData.forEach((val) => bf.add(val));
// Checking false positive rate
let falsePositiveCount = 0;
falseData.forEach((val) => {
if (bf.has(val)) falsePositiveCount++;
});
const falsePositiveRate = falsePositiveCount / falseData.length;
expect(falsePositiveRate).toBeLessThanOrEqual(targetFalsePositiveRate); // Adjust the expectation based on your tolerance
});
test("should handle a large number of entries", () => {
const bf = new BloomFilter(10000);
for (let i = 0; i < 10000; i++) {
bf.add(`test${i}`);
}
for (let i = 0; i < 10000; i++) {
expect(bf.has(`test${i}`)).toBe(true);
}
// Checking existence
expect(bf.has("test10000")).toBe(false);
});
});
describe("BloomFilter Property Tests", () => {
testFC.prop([fc.asciiString()])("should handle ASCII strings", (str) => {
const bf = new BloomFilter(1000);
expect(bf.has(str)).toBe(false);
bf.add(str);
expect(bf.has(str)).toBe(true);
});
testFC.prop([fc.unicodeString()])("should handle Unicode strings", (str) => {
const bf = new BloomFilter(1000);
expect(bf.has(str)).toBe(false);
bf.add(str);
expect(bf.has(str)).toBe(true);
});
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment