Last active
November 4, 2023 04:23
-
-
Save shuntksh/93aa4a7f0080d4b5a273b9bce1b97e38 to your computer and use it in GitHub Desktop.
Bloom Filter in Typescript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* BloomFilter is a probabilistic data structure used to test whether an element | |
* is a member of a set, allowing for false positives. | |
*/ | |
export class BloomFilter { | |
static readonly DEFAULT_BIT_LEN = 8; | |
#size: number; | |
#bitArray: Uint8Array; // Uint8Array might be slower than BigInt64Array but easier to debug | |
#hashPassCount: number; | |
#count = 0; | |
constructor(expectedItems: number, falsePositiveRate: number = 0.01) { | |
// Calculate the size of the bit array (m) | |
// then find the next power of two for the size to avoid modulo bias. | |
const _size = Math.ceil((-expectedItems * Math.log(falsePositiveRate)) / Math.pow(Math.log(2), 2)); | |
this.#size = Math.pow(2, Math.ceil(Math.log(_size) / Math.log(2))); | |
this.#hashPassCount = this.#getHashPassCount(this.#size); | |
this.#bitArray = new Uint8Array(Math.ceil(this.#size / BloomFilter.DEFAULT_BIT_LEN)); | |
console.log(this.#size, this.#hashPassCount); | |
} | |
get count() { | |
return this.#count; | |
} | |
getBitArray() { | |
return new Uint8Array(this.#bitArray); | |
} | |
/** | |
* Add a value to the BloomFilter. | |
*/ | |
add(value: string) { | |
if (this.has(value)) return; | |
if (this.#isCloseToLimit()) this.resize(); | |
for (let i = 0; i < this.#hashPassCount; i++) { | |
const hash = this.#combinedHash(value, i); | |
this.#setBit(hash); | |
} | |
this.#count++; | |
} | |
/** | |
* Check whether a value is possibly in the BloomFilter. | |
*/ | |
has(value: string): boolean { | |
for (let i = 0; i < this.#hashPassCount; i++) { | |
const hash = this.#combinedHash(value, i); | |
if (!this.#getBit(hash)) return false; | |
} | |
return true; | |
} | |
/** | |
* Resizes the bit array and increases the number of hash functions. | |
* This is done by doubling the size of the bit array and recalculating the optimal hash count. | |
*/ | |
resize() { | |
const newSize = this.#size * 2; | |
const newBitArray = new Uint8Array(newSize); | |
newBitArray.set(this.#bitArray); | |
this.#size = newSize; | |
this.#bitArray = newBitArray; | |
this.#hashPassCount = this.#getHashPassCount(); | |
} | |
/** | |
* Gets the bit value at a specific index in the Bloom filter. | |
*/ | |
#getBit(index: number): boolean { | |
const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN); | |
const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN; | |
return (this.#bitArray[byteIndex] & (1 << bitIndex)) !== 0; | |
} | |
/** | |
* Sets a bit to 1 at a specific index in the Bloom filter. | |
*/ | |
#setBit(index: number) { | |
const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN); | |
const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN; | |
this.#bitArray[byteIndex] |= 1 << bitIndex; | |
} | |
#combinedHash(value: string, salt: number): number { | |
const combinedValue = value + salt.toString(); | |
const hash1 = this.#hashDJB2(combinedValue); | |
const hash2 = this.#hashSDBM(combinedValue); | |
const hash3 = this.#hashMurmur3(combinedValue); | |
const hash4 = this.#hashFNV1a(combinedValue); | |
return ((hash1 ^ hash2 ^ hash3 ^ hash4) >>> 0) % this.#size; | |
} | |
/** | |
* DJB2 hashing function. | |
* A simple and effective string hashing algorithm. It’s used as one of the | |
* hashing functions in the Bloom filter, assisting in minimizing the | |
* likelihood of collision. | |
* | |
* See: http://www.cse.yorku.ca/~oz/hash.html | |
* | |
*/ | |
#hashDJB2(value: string): number { | |
// Magic constant helps in reducing the chance of collision | |
let hash = 5381; | |
// Bitwise XOR and multiplication by prime number (33) to scramble the bits. | |
for (let i = 0; i < value.length; i++) hash = (hash * 33) ^ value.charCodeAt(i); | |
// Ensure a positive number within the range of the bit array size. | |
return (hash >>> 0) % this.#size; | |
} | |
/** | |
* SDBM hashing function. | |
* A hash function suitable for hash table lookup. It’s used within the Bloom filter | |
* to create hash values, making it able to ascertain element membership. | |
* | |
* see: http://www.cse.yorku.ca/~oz/hash.html | |
* | |
*/ | |
#hashSDBM(value: string): number { | |
let hash = 0; | |
// Combination of bitwise operations and arithmetic to scramble the bits. | |
for (let i = 0; i < value.length; i++) hash = value.charCodeAt(i) + (hash << 6) + (hash << 16) - hash; | |
// Ensure a positive number within the range of the bit array size. | |
return (hash >>> 0) % this.#size; | |
} | |
/** | |
* MurmurHash3 hashing function. | |
* This is a non-cryptographic, fast hash function. It takes in a string value | |
* and produces a 32-bit hash. The function is utilized to generate hash values | |
* to be used in the Bloom filter, allowing it to decide the presence of an element. | |
* | |
* see: https://en.wikipedia.org/wiki/MurmurHash | |
* | |
*/ | |
#hashMurmur3(value: string): number { | |
// Constants are defined for use in the bitwise operations, part of MurmurHash3 specification | |
const c1 = 0xcc9e2d51; | |
const c2 = 0x1b873593; | |
let hash = 0; | |
const chunks = Math.ceil(value.length / 4); | |
for (let c = 0; c < chunks; c++) { | |
let k = 0; | |
for (let i = 0; i < 4; i++) { | |
// Left shift operation for ordering bytes, and OR operation to combine them. | |
if (c * 4 + i < value.length) k |= value.charCodeAt(c * 4 + i) << (i * 8); | |
} | |
// Multiply k by a constant, followed by bitwise rotation and multiplication by another constant. | |
k = k * c1; | |
k = (k << 15) | (k >>> 17); // Bitwise rotation (left by 15, right by 17) | |
k = k * c2; | |
// XOR operation with hash, followed by bitwise rotations and multiplication. | |
hash ^= k; | |
hash = (hash << 13) | (hash >>> 19); // Bitwise rotation (left by 13, right by 19) | |
hash = hash * 5 + 0xe6546b64; | |
} | |
// XOR operation with length of string. | |
hash ^= value.length; | |
// Series of bitwise operations to finalize the hash value. | |
hash ^= hash >>> 16; | |
hash *= 0x85ebca6b; // Multiplication with constant | |
hash ^= hash >>> 13; // XOR after bitwise rotation | |
hash *= 0xc2b2ae35; // Multiplication with constant | |
hash ^= hash >>> 16; // XOR after bitwise rotation | |
// Ensuring the hash value is positive and fits within the size of the bit array. | |
return (hash >>> 0) % this.#size; | |
} | |
/** | |
* FNV-1a hash function. | |
* Created by Glenn Fowler, Landon Curt Noll, and Kiem-Phong Vo in 1991. | |
* | |
* see: http://www.isthe.com/chongo/tech/comp/fnv/index.html | |
* | |
*/ | |
#hashFNV1a(value: string): number { | |
const FNV_prime = 0x01000193; // FNV prime value for 32-bit hash | |
let hash = 0x811c9dc5; // FNV offset basis for 32-bit hash | |
for (let i = 0; i < value.length; i++) { | |
const byte_of_data = value.charCodeAt(i); | |
hash ^= byte_of_data; // XOR byte of data into the bottom of the hash. | |
hash *= FNV_prime; // Multiply by FNV prime to disperse the hash. | |
} | |
return (hash >>> 0) % this.#size; // ensure the hash is unsigned | |
} | |
#isCloseToLimit() { | |
return this.#count / this.#size >= 0.5; // Threshold can be adjusted | |
} | |
/** | |
* Calculates the optimal number of hash functions based on the current size of the bit array | |
* and the desired false positive rate. | |
*/ | |
#getHashPassCount(size: number = this.#size) { | |
// The formula for the optimal number of hash functions is k = (m/n) * ln(2) | |
// However, we need an estimate for n. We'll use the current size as an approximation for n. | |
// This is a simplification and not technically accurate. | |
const n = size * 0.25; // This is an approximation | |
const k = Math.round((size / n) * Math.log(2)); // Calculate the optimal number of hash functions | |
return Math.max(1, k); // Ensure we have at least one hash function | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { fc, test as testFC } from "@fast-check/vitest"; | |
import { webcrypto } from "node:crypto"; | |
import { describe, expect, test } from "vitest"; | |
import { BloomFilter } from "./bloom-filter"; | |
describe("BloomFilter", () => { | |
test("should add and check values correctly", () => { | |
const bf = new BloomFilter(1000); | |
bf.add("test"); | |
expect(bf.has("test")).toBe(true); | |
expect(bf.has("unknown")).toBe(false); | |
}); | |
test("should handle different string lengths", () => { | |
const bf = new BloomFilter(1000); | |
bf.add("a"); | |
bf.add("abcdefghijklmnopqrstuvwxyz"); | |
expect(bf.has("a")).toBe(true); | |
expect(bf.has("abcdefghijklmnopqrstuvwxyz")).toBe(true); | |
}); | |
test("false positive rate should be reasonable", () => { | |
const targetFalsePositiveRate = 0.01; | |
const bf = new BloomFilter(1500, targetFalsePositiveRate); | |
const testData: string[] = []; | |
const falseData: string[] = []; | |
// Generating test data | |
for (let i = 0; i < 1000; i++) { | |
testData.push(webcrypto.randomUUID()); | |
falseData.push(webcrypto.randomUUID()); | |
} | |
// Adding test data to bloom filter | |
testData.forEach((val) => bf.add(val)); | |
// Checking false positive rate | |
let falsePositiveCount = 0; | |
falseData.forEach((val) => { | |
if (bf.has(val)) falsePositiveCount++; | |
}); | |
const falsePositiveRate = falsePositiveCount / falseData.length; | |
expect(falsePositiveRate).toBeLessThanOrEqual(targetFalsePositiveRate); // Adjust the expectation based on your tolerance | |
}); | |
test("should handle a large number of entries", () => { | |
const bf = new BloomFilter(10000); | |
for (let i = 0; i < 10000; i++) { | |
bf.add(`test${i}`); | |
} | |
for (let i = 0; i < 10000; i++) { | |
expect(bf.has(`test${i}`)).toBe(true); | |
} | |
// Checking existence | |
expect(bf.has("test10000")).toBe(false); | |
}); | |
}); | |
describe("BloomFilter Property Tests", () => { | |
testFC.prop([fc.asciiString()])("should handle ASCII strings", (str) => { | |
const bf = new BloomFilter(1000); | |
expect(bf.has(str)).toBe(false); | |
bf.add(str); | |
expect(bf.has(str)).toBe(true); | |
}); | |
testFC.prop([fc.unicodeString()])("should handle Unicode strings", (str) => { | |
const bf = new BloomFilter(1000); | |
expect(bf.has(str)).toBe(false); | |
bf.add(str); | |
expect(bf.has(str)).toBe(true); | |
}); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment