shuntksh/bloom-filter.ts

## bloom-filter.ts
/**
 * BloomFilter is a probabilistic data structure used to test whether an element
 * is a member of a set, allowing for false positives.
 */
export class BloomFilter {
  static readonly DEFAULT_BIT_LEN = 8;

  #size: number;
  #bitArray: Uint8Array; // Uint8Array might be slower than BigInt64Array but easier to debug
  #hashPassCount: number;
  #count = 0;

  constructor(expectedItems: number, falsePositiveRate: number = 0.01) {
    // Calculate the size of the bit array (m)
    // then find the next power of two for the size to avoid modulo bias.
    const _size = Math.ceil((-expectedItems * Math.log(falsePositiveRate)) / Math.pow(Math.log(2), 2));
    this.#size = Math.pow(2, Math.ceil(Math.log(_size) / Math.log(2)));
    this.#hashPassCount = this.#getHashPassCount(this.#size);
    this.#bitArray = new Uint8Array(Math.ceil(this.#size / BloomFilter.DEFAULT_BIT_LEN));
    console.log(this.#size, this.#hashPassCount);
  }

  get count() {
    return this.#count;
  }

  getBitArray() {
    return new Uint8Array(this.#bitArray);
  }

  /**
   * Add a value to the BloomFilter.
   */
  add(value: string) {
    if (this.has(value)) return;
    if (this.#isCloseToLimit()) this.resize();
    for (let i = 0; i < this.#hashPassCount; i++) {
      const hash = this.#combinedHash(value, i);
      this.#setBit(hash);
    }
    this.#count++;
  }

  /**
   * Check whether a value is possibly in the BloomFilter.
   */
  has(value: string): boolean {
    for (let i = 0; i < this.#hashPassCount; i++) {
      const hash = this.#combinedHash(value, i);
      if (!this.#getBit(hash)) return false;
    }
    return true;
  }

  /**
   * Resizes the bit array and increases the number of hash functions.
   * This is done by doubling the size of the bit array and recalculating the optimal hash count.
   */
  resize() {
    const newSize = this.#size * 2;
    const newBitArray = new Uint8Array(newSize);
    newBitArray.set(this.#bitArray);

    this.#size = newSize;
    this.#bitArray = newBitArray;
    this.#hashPassCount = this.#getHashPassCount();
  }

  /**
   * Gets the bit value at a specific index in the Bloom filter.
   */
  #getBit(index: number): boolean {
    const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
    const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
    return (this.#bitArray[byteIndex] & (1 << bitIndex)) !== 0;
  }

  /**
   * Sets a bit to 1 at a specific index in the Bloom filter.
   */
  #setBit(index: number) {
    const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
    const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
    this.#bitArray[byteIndex] |= 1 << bitIndex;
  }

  #combinedHash(value: string, salt: number): number {
    const combinedValue = value + salt.toString();
    const hash1 = this.#hashDJB2(combinedValue);
    const hash2 = this.#hashSDBM(combinedValue);
    const hash3 = this.#hashMurmur3(combinedValue);
    const hash4 = this.#hashFNV1a(combinedValue);
    return ((hash1 ^ hash2 ^ hash3 ^ hash4) >>> 0) % this.#size;
  }

  /**
   * DJB2 hashing function.
   * A simple and effective string hashing algorithm. It’s used as one of the
   * hashing functions in the Bloom filter, assisting in minimizing the
   * likelihood of collision.
   *
   * See: http://www.cse.yorku.ca/~oz/hash.html
   *
   */
  #hashDJB2(value: string): number {
    // Magic constant helps in reducing the chance of collision
    let hash = 5381;
    // Bitwise XOR and multiplication by prime number (33) to scramble the bits.
    for (let i = 0; i < value.length; i++) hash = (hash * 33) ^ value.charCodeAt(i);
    // Ensure a positive number within the range of the bit array size.
    return (hash >>> 0) % this.#size;
  }

  /**
   * SDBM hashing function.
   * A hash function suitable for hash table lookup. It’s used within the Bloom filter
   * to create hash values, making it able to ascertain element membership.
   *
   * see: http://www.cse.yorku.ca/~oz/hash.html
   *
   */
  #hashSDBM(value: string): number {
    let hash = 0;
    // Combination of bitwise operations and arithmetic to scramble the bits.
    for (let i = 0; i < value.length; i++) hash = value.charCodeAt(i) + (hash << 6) + (hash << 16) - hash;
    // Ensure a positive number within the range of the bit array size.
    return (hash >>> 0) % this.#size;
  }

  /**
   * MurmurHash3 hashing function.
   * This is a non-cryptographic, fast hash function. It takes in a string value
   * and produces a 32-bit hash. The function is utilized to generate hash values
   * to be used in the Bloom filter, allowing it to decide the presence of an element.
   *
   * see: https://en.wikipedia.org/wiki/MurmurHash
   *
   */
  #hashMurmur3(value: string): number {
    // Constants are defined for use in the bitwise operations, part of MurmurHash3 specification
    const c1 = 0xcc9e2d51;
    const c2 = 0x1b873593;
    let hash = 0;

    const chunks = Math.ceil(value.length / 4);
    for (let c = 0; c < chunks; c++) {
      let k = 0;
      for (let i = 0; i < 4; i++) {
        // Left shift operation for ordering bytes, and OR operation to combine them.
        if (c * 4 + i < value.length) k |= value.charCodeAt(c * 4 + i) << (i * 8);
      }
      // Multiply k by a constant, followed by bitwise rotation and multiplication by another constant.
      k = k * c1;
      k = (k << 15) | (k >>> 17); // Bitwise rotation (left by 15, right by 17)
      k = k * c2;
      // XOR operation with hash, followed by bitwise rotations and multiplication.
      hash ^= k;
      hash = (hash << 13) | (hash >>> 19); // Bitwise rotation (left by 13, right by 19)
      hash = hash * 5 + 0xe6546b64;
    }

    // XOR operation with length of string.
    hash ^= value.length;

    // Series of bitwise operations to finalize the hash value.
    hash ^= hash >>> 16;
    hash *= 0x85ebca6b; // Multiplication with constant
    hash ^= hash >>> 13; // XOR after bitwise rotation
    hash *= 0xc2b2ae35; // Multiplication with constant
    hash ^= hash >>> 16; // XOR after bitwise rotation

    // Ensuring the hash value is positive and fits within the size of the bit array.
    return (hash >>> 0) % this.#size;
  }

  /**
   * FNV-1a hash function.
   * Created by Glenn Fowler, Landon Curt Noll, and Kiem-Phong Vo in 1991.
   *
   * see: http://www.isthe.com/chongo/tech/comp/fnv/index.html
   *
   */
  #hashFNV1a(value: string): number {
    const FNV_prime = 0x01000193; // FNV prime value for 32-bit hash
    let hash = 0x811c9dc5; // FNV offset basis for 32-bit hash
    for (let i = 0; i < value.length; i++) {
      const byte_of_data = value.charCodeAt(i);
      hash ^= byte_of_data; // XOR byte of data into the bottom of the hash.
      hash *= FNV_prime; // Multiply by FNV prime to disperse the hash.
    }
    return (hash >>> 0) % this.#size; // ensure the hash is unsigned
  }

  #isCloseToLimit() {
    return this.#count / this.#size >= 0.5; // Threshold can be adjusted
  }

  /**
   * Calculates the optimal number of hash functions based on the current size of the bit array
   * and the desired false positive rate.
   */
  #getHashPassCount(size: number = this.#size) {
    // The formula for the optimal number of hash functions is k = (m/n) * ln(2)
    // However, we need an estimate for n. We'll use the current size as an approximation for n.
    // This is a simplification and not technically accurate.
    const n = size * 0.25; // This is an approximation
    const k = Math.round((size / n) * Math.log(2)); // Calculate the optimal number of hash functions
    return Math.max(1, k); // Ensure we have at least one hash function
  }
}

## test.ts
import { fc, test as testFC } from "@fast-check/vitest";
import { webcrypto } from "node:crypto";
import { describe, expect, test } from "vitest";
import { BloomFilter } from "./bloom-filter";

describe("BloomFilter", () => {
  test("should add and check values correctly", () => {
    const bf = new BloomFilter(1000);
    bf.add("test");
    expect(bf.has("test")).toBe(true);
    expect(bf.has("unknown")).toBe(false);
  });

  test("should handle different string lengths", () => {
    const bf = new BloomFilter(1000);
    bf.add("a");
    bf.add("abcdefghijklmnopqrstuvwxyz");
    expect(bf.has("a")).toBe(true);
    expect(bf.has("abcdefghijklmnopqrstuvwxyz")).toBe(true);
  });

  test("false positive rate should be reasonable", () => {
    const targetFalsePositiveRate = 0.01;
    const bf = new BloomFilter(1500, targetFalsePositiveRate);
    const testData: string[] = [];
    const falseData: string[] = [];

    // Generating test data
    for (let i = 0; i < 1000; i++) {
      testData.push(webcrypto.randomUUID());
      falseData.push(webcrypto.randomUUID());
    }

    // Adding test data to bloom filter
    testData.forEach((val) => bf.add(val));

    // Checking false positive rate
    let falsePositiveCount = 0;
    falseData.forEach((val) => {
      if (bf.has(val)) falsePositiveCount++;
    });

    const falsePositiveRate = falsePositiveCount / falseData.length;
    expect(falsePositiveRate).toBeLessThanOrEqual(targetFalsePositiveRate); // Adjust the expectation based on your tolerance
  });

  test("should handle a large number of entries", () => {
    const bf = new BloomFilter(10000);
    for (let i = 0; i < 10000; i++) {
      bf.add(`test${i}`);
    }

    for (let i = 0; i < 10000; i++) {
      expect(bf.has(`test${i}`)).toBe(true);
    }

    // Checking existence
    expect(bf.has("test10000")).toBe(false);
  });
});

describe("BloomFilter Property Tests", () => {
  testFC.prop([fc.asciiString()])("should handle ASCII strings", (str) => {
    const bf = new BloomFilter(1000);
    expect(bf.has(str)).toBe(false);
    bf.add(str);
    expect(bf.has(str)).toBe(true);
  });

  testFC.prop([fc.unicodeString()])("should handle Unicode strings", (str) => {
    const bf = new BloomFilter(1000);
    expect(bf.has(str)).toBe(false);
    bf.add(str);
    expect(bf.has(str)).toBe(true);
  });
});
	/**
	* BloomFilter is a probabilistic data structure used to test whether an element
	* is a member of a set, allowing for false positives.
	*/
	export class BloomFilter {
	static readonly DEFAULT_BIT_LEN = 8;

	#size: number;
	#bitArray: Uint8Array; // Uint8Array might be slower than BigInt64Array but easier to debug
	#hashPassCount: number;
	#count = 0;

	constructor(expectedItems: number, falsePositiveRate: number = 0.01) {
	// Calculate the size of the bit array (m)
	// then find the next power of two for the size to avoid modulo bias.
	const _size = Math.ceil((-expectedItems * Math.log(falsePositiveRate)) / Math.pow(Math.log(2), 2));
	this.#size = Math.pow(2, Math.ceil(Math.log(_size) / Math.log(2)));
	this.#hashPassCount = this.#getHashPassCount(this.#size);
	this.#bitArray = new Uint8Array(Math.ceil(this.#size / BloomFilter.DEFAULT_BIT_LEN));
	console.log(this.#size, this.#hashPassCount);
	}

	get count() {
	return this.#count;
	}

	getBitArray() {
	return new Uint8Array(this.#bitArray);
	}

	/**
	* Add a value to the BloomFilter.
	*/
	add(value: string) {
	if (this.has(value)) return;
	if (this.#isCloseToLimit()) this.resize();
	for (let i = 0; i < this.#hashPassCount; i++) {
	const hash = this.#combinedHash(value, i);
	this.#setBit(hash);
	}
	this.#count++;
	}

	/**
	* Check whether a value is possibly in the BloomFilter.
	*/
	has(value: string): boolean {
	for (let i = 0; i < this.#hashPassCount; i++) {
	const hash = this.#combinedHash(value, i);
	if (!this.#getBit(hash)) return false;
	}
	return true;
	}

	/**
	* Resizes the bit array and increases the number of hash functions.
	* This is done by doubling the size of the bit array and recalculating the optimal hash count.
	*/
	resize() {
	const newSize = this.#size * 2;
	const newBitArray = new Uint8Array(newSize);
	newBitArray.set(this.#bitArray);

	this.#size = newSize;
	this.#bitArray = newBitArray;
	this.#hashPassCount = this.#getHashPassCount();
	}

	/**
	* Gets the bit value at a specific index in the Bloom filter.
	*/
	#getBit(index: number): boolean {
	const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
	const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
	return (this.#bitArray[byteIndex] & (1 << bitIndex)) !== 0;
	}

	/**
	* Sets a bit to 1 at a specific index in the Bloom filter.
	*/
	#setBit(index: number) {
	const byteIndex = Math.floor(index / BloomFilter.DEFAULT_BIT_LEN);
	const bitIndex = index % BloomFilter.DEFAULT_BIT_LEN;
	this.#bitArray[byteIndex] \|= 1 << bitIndex;
	}

	#combinedHash(value: string, salt: number): number {
	const combinedValue = value + salt.toString();
	const hash1 = this.#hashDJB2(combinedValue);
	const hash2 = this.#hashSDBM(combinedValue);
	const hash3 = this.#hashMurmur3(combinedValue);
	const hash4 = this.#hashFNV1a(combinedValue);
	return ((hash1 ^ hash2 ^ hash3 ^ hash4) >>> 0) % this.#size;
	}

	/**
	* DJB2 hashing function.
	* A simple and effective string hashing algorithm. It’s used as one of the
	* hashing functions in the Bloom filter, assisting in minimizing the
	* likelihood of collision.
	*
	* See: http://www.cse.yorku.ca/~oz/hash.html
	*
	*/
	#hashDJB2(value: string): number {
	// Magic constant helps in reducing the chance of collision
	let hash = 5381;
	// Bitwise XOR and multiplication by prime number (33) to scramble the bits.
	for (let i = 0; i < value.length; i++) hash = (hash * 33) ^ value.charCodeAt(i);
	// Ensure a positive number within the range of the bit array size.
	return (hash >>> 0) % this.#size;
	}

	/**
	* SDBM hashing function.
	* A hash function suitable for hash table lookup. It’s used within the Bloom filter
	* to create hash values, making it able to ascertain element membership.
	*
	* see: http://www.cse.yorku.ca/~oz/hash.html
	*
	*/
	#hashSDBM(value: string): number {
	let hash = 0;
	// Combination of bitwise operations and arithmetic to scramble the bits.
	for (let i = 0; i < value.length; i++) hash = value.charCodeAt(i) + (hash << 6) + (hash << 16) - hash;
	// Ensure a positive number within the range of the bit array size.
	return (hash >>> 0) % this.#size;
	}

	/**
	* MurmurHash3 hashing function.
	* This is a non-cryptographic, fast hash function. It takes in a string value
	* and produces a 32-bit hash. The function is utilized to generate hash values
	* to be used in the Bloom filter, allowing it to decide the presence of an element.
	*
	* see: https://en.wikipedia.org/wiki/MurmurHash
	*
	*/
	#hashMurmur3(value: string): number {
	// Constants are defined for use in the bitwise operations, part of MurmurHash3 specification
	const c1 = 0xcc9e2d51;
	const c2 = 0x1b873593;
	let hash = 0;

	const chunks = Math.ceil(value.length / 4);
	for (let c = 0; c < chunks; c++) {
	let k = 0;
	for (let i = 0; i < 4; i++) {
	// Left shift operation for ordering bytes, and OR operation to combine them.
	if (c * 4 + i < value.length) k \|= value.charCodeAt(c * 4 + i) << (i * 8);
	}
	// Multiply k by a constant, followed by bitwise rotation and multiplication by another constant.
	k = k * c1;
	k = (k << 15) \| (k >>> 17); // Bitwise rotation (left by 15, right by 17)
	k = k * c2;
	// XOR operation with hash, followed by bitwise rotations and multiplication.
	hash ^= k;
	hash = (hash << 13) \| (hash >>> 19); // Bitwise rotation (left by 13, right by 19)
	hash = hash * 5 + 0xe6546b64;
	}

	// XOR operation with length of string.
	hash ^= value.length;

	// Series of bitwise operations to finalize the hash value.
	hash ^= hash >>> 16;
	hash *= 0x85ebca6b; // Multiplication with constant
	hash ^= hash >>> 13; // XOR after bitwise rotation
	hash *= 0xc2b2ae35; // Multiplication with constant
	hash ^= hash >>> 16; // XOR after bitwise rotation

	// Ensuring the hash value is positive and fits within the size of the bit array.
	return (hash >>> 0) % this.#size;
	}

	/**
	* FNV-1a hash function.
	* Created by Glenn Fowler, Landon Curt Noll, and Kiem-Phong Vo in 1991.
	*
	* see: http://www.isthe.com/chongo/tech/comp/fnv/index.html
	*
	*/
	#hashFNV1a(value: string): number {
	const FNV_prime = 0x01000193; // FNV prime value for 32-bit hash
	let hash = 0x811c9dc5; // FNV offset basis for 32-bit hash
	for (let i = 0; i < value.length; i++) {
	const byte_of_data = value.charCodeAt(i);
	hash ^= byte_of_data; // XOR byte of data into the bottom of the hash.
	hash *= FNV_prime; // Multiply by FNV prime to disperse the hash.
	}
	return (hash >>> 0) % this.#size; // ensure the hash is unsigned
	}

	#isCloseToLimit() {
	return this.#count / this.#size >= 0.5; // Threshold can be adjusted
	}

	/**
	* Calculates the optimal number of hash functions based on the current size of the bit array
	* and the desired false positive rate.
	*/
	#getHashPassCount(size: number = this.#size) {
	// The formula for the optimal number of hash functions is k = (m/n) * ln(2)
	// However, we need an estimate for n. We'll use the current size as an approximation for n.
	// This is a simplification and not technically accurate.
	const n = size * 0.25; // This is an approximation
	const k = Math.round((size / n) * Math.log(2)); // Calculate the optimal number of hash functions
	return Math.max(1, k); // Ensure we have at least one hash function
	}
	}
	import { fc, test as testFC } from "@fast-check/vitest";
	import { webcrypto } from "node:crypto";
	import { describe, expect, test } from "vitest";
	import { BloomFilter } from "./bloom-filter";

	describe("BloomFilter", () => {
	test("should add and check values correctly", () => {
	const bf = new BloomFilter(1000);
	bf.add("test");
	expect(bf.has("test")).toBe(true);
	expect(bf.has("unknown")).toBe(false);
	});

	test("should handle different string lengths", () => {
	const bf = new BloomFilter(1000);
	bf.add("a");
	bf.add("abcdefghijklmnopqrstuvwxyz");
	expect(bf.has("a")).toBe(true);
	expect(bf.has("abcdefghijklmnopqrstuvwxyz")).toBe(true);
	});

	test("false positive rate should be reasonable", () => {
	const targetFalsePositiveRate = 0.01;
	const bf = new BloomFilter(1500, targetFalsePositiveRate);
	const testData: string[] = [];
	const falseData: string[] = [];

	// Generating test data
	for (let i = 0; i < 1000; i++) {
	testData.push(webcrypto.randomUUID());
	falseData.push(webcrypto.randomUUID());
	}

	// Adding test data to bloom filter
	testData.forEach((val) => bf.add(val));

	// Checking false positive rate
	let falsePositiveCount = 0;
	falseData.forEach((val) => {
	if (bf.has(val)) falsePositiveCount++;
	});

	const falsePositiveRate = falsePositiveCount / falseData.length;
	expect(falsePositiveRate).toBeLessThanOrEqual(targetFalsePositiveRate); // Adjust the expectation based on your tolerance
	});

	test("should handle a large number of entries", () => {
	const bf = new BloomFilter(10000);
	for (let i = 0; i < 10000; i++) {
	bf.add(`test${i}`);
	}

	for (let i = 0; i < 10000; i++) {
	expect(bf.has(`test${i}`)).toBe(true);
	}

	// Checking existence
	expect(bf.has("test10000")).toBe(false);
	});
	});

	describe("BloomFilter Property Tests", () => {
	testFC.prop([fc.asciiString()])("should handle ASCII strings", (str) => {
	const bf = new BloomFilter(1000);
	expect(bf.has(str)).toBe(false);
	bf.add(str);
	expect(bf.has(str)).toBe(true);
	});

	testFC.prop([fc.unicodeString()])("should handle Unicode strings", (str) => {
	const bf = new BloomFilter(1000);
	expect(bf.has(str)).toBe(false);
	bf.add(str);
	expect(bf.has(str)).toBe(true);
	});
	});