Skip to content

Instantly share code, notes, and snippets.

@PhilipTrauner
Created March 1, 2023 10:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PhilipTrauner/138d1375d3667be94312f97b5bbad3e6 to your computer and use it in GitHub Desktop.
Save PhilipTrauner/138d1375d3667be94312f97b5bbad3e6 to your computer and use it in GitHub Desktop.
deterministic bloom filter
import * as t from "io-ts";
import { createHash } from "node:crypto";
import type { BinaryLike } from "node:crypto";
import { Maybe } from "../../type/utility";
// https://hur.st/bloomfilter
/*
number of items in the filter → `itemCount`
n = ceil(m / (-k / log(1 - exp(log(p) / k))))
probability of false positives → `accuracy`
p = pow(1 - exp(-k / (m / n)), k)
number of bits in filter → `bitCount`
m = ceil((n * log(p)) / log(1 / pow(2, log(2))));
number of hash functions → `hashCount`
k = round((m / n) * log(2));
*/
const bitCount = (itemCount: number, accuracy: number) =>
Math.ceil((itemCount * Math.log(accuracy)) / Math.log(1 / Math.pow(2, Math.log(2))));
const hashCount = (bitCount: number, itemCount: number) =>
Math.round((bitCount / itemCount) * Math.log(2));
type HashPair = {
one: Buffer;
two: Buffer;
}
const hashPair = (item: BinaryLike): HashPair => {
return {
one: createHash("sha256").update(item).digest(),
two: createHash("sha512").update(item).digest()
}
}
// http://peterd.org/pcd-diss.pdf (6.5.4)
/**
* @param n - index of the hash function
* @param hashOne
* @param hashTwo
* @param bitCount
*/
const doubleHashing = (
n: number,
hashOne: number,
hashTwo: number,
bitCount: number
): number => {
return Math.abs(
(hashOne + n * hashTwo + Math.floor((n ** 3 - n) / 6)) % bitCount
)
}
const getIndices = (
item: BinaryLike,
hashCount: number,
bitCount: number
): number[] => {
const pair = hashPair(item)
const hashes = {
one: pair.one.readUIntBE(0, 6),
two: pair.two.readUIntBE(0, 6)
}
const indices = []
for (let index = 0; index < hashCount; index++) {
const doubleHashed = doubleHashing(
index,
hashes.one,
hashes.two,
bitCount
)
indices.push(doubleHashed)
}
return indices;
};
const BloomFilterParameters = t.type({
bitCount: t.number,
hashCount: t.number
})
type BloomFilterParameters = t.TypeOf<typeof BloomFilterParameters>;
export const EncodedSealedBloomFilter = t.type({
words: t.string,
parameters: BloomFilterParameters
})
export type EncodedSealedBloomFilter = t.TypeOf<typeof EncodedSealedBloomFilter>;
class SealedBloomFilter {
constructor(
private words: ReturnType<typeof BloomFilter["fill"]>,
private parameters: BloomFilterParameters
) {}
has(item: BinaryLike): boolean {
for (
const index of getIndices(
item,
this.parameters.hashCount,
this.parameters.bitCount
)
) {
const wordIndex = Math.floor(index / 8)
const mask = 1 << index % 8
if ((this.words[wordIndex] & mask) === 0) {
return false;
}
}
return true;
}
encode(): EncodedSealedBloomFilter {
return {
words: Buffer.from(this.words).toString('base64'),
parameters: this.parameters
}
}
}
export class BloomFilter {
private words: Uint8Array;
constructor(
private parameters: BloomFilterParameters
) {
this.words = BloomFilter.fill(parameters.bitCount);
}
private static fill(bitCount: number) {
const bitsPerWord = Uint8Array.BYTES_PER_ELEMENT * 8;
const diff = bitsPerWord - (bitCount % bitsPerWord);
const size = bitCount + ([0, bitsPerWord].includes(diff) ? 0 : diff)
return new Uint8Array(Math.ceil(size / bitsPerWord))
}
add(item: BinaryLike): void {
for (
const index of getIndices(
item,
this.parameters.hashCount,
this.parameters.bitCount
)
) {
const wordIndex = Math.floor(index / 8)
const mask = 1 << index % 8
this.words[wordIndex] = this.words[wordIndex] | mask
}
}
/**
* @param items
* @param accuracy - desired false positive rate as fraction
*/
static from(
items: Set<BinaryLike>,
accuracy: number,
) {
const _bitCount = bitCount(items.size, accuracy);
const parameters: BloomFilterParameters = {
bitCount: _bitCount,
hashCount: hashCount(_bitCount, items.size)
}
const filter = new BloomFilter(parameters);
items.forEach(item => filter.add(item));
return filter.seal();
}
static decode(encoded: EncodedSealedBloomFilter): Maybe<SealedBloomFilter> {
let buffer;
try {
buffer = Buffer.from(encoded.words, 'base64')
} catch {
return null;
}
const words = new Uint8Array(buffer);
return new SealedBloomFilter(words, encoded.parameters);
}
seal(): SealedBloomFilter {
return new SealedBloomFilter(
new Uint8Array(this.words),
{ ...this.parameters }
);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment