Created
March 1, 2023 10:53
-
-
Save PhilipTrauner/138d1375d3667be94312f97b5bbad3e6 to your computer and use it in GitHub Desktop.
deterministic bloom filter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as t from "io-ts"; | |
import { createHash } from "node:crypto"; | |
import type { BinaryLike } from "node:crypto"; | |
import { Maybe } from "../../type/utility"; | |
// https://hur.st/bloomfilter | |
/* | |
number of items in the filter → `itemCount` | |
n = ceil(m / (-k / log(1 - exp(log(p) / k)))) | |
probability of false positives → `accuracy` | |
p = pow(1 - exp(-k / (m / n)), k) | |
number of bits in filter → `bitCount` | |
m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); | |
number of hash functions → `hashCount` | |
k = round((m / n) * log(2)); | |
*/ | |
const bitCount = (itemCount: number, accuracy: number) => | |
Math.ceil((itemCount * Math.log(accuracy)) / Math.log(1 / Math.pow(2, Math.log(2)))); | |
const hashCount = (bitCount: number, itemCount: number) => | |
Math.round((bitCount / itemCount) * Math.log(2)); | |
type HashPair = { | |
one: Buffer; | |
two: Buffer; | |
} | |
const hashPair = (item: BinaryLike): HashPair => { | |
return { | |
one: createHash("sha256").update(item).digest(), | |
two: createHash("sha512").update(item).digest() | |
} | |
} | |
// http://peterd.org/pcd-diss.pdf (6.5.4) | |
/** | |
* @param n - index of the hash function | |
* @param hashOne | |
* @param hashTwo | |
* @param bitCount | |
*/ | |
const doubleHashing = ( | |
n: number, | |
hashOne: number, | |
hashTwo: number, | |
bitCount: number | |
): number => { | |
return Math.abs( | |
(hashOne + n * hashTwo + Math.floor((n ** 3 - n) / 6)) % bitCount | |
) | |
} | |
const getIndices = ( | |
item: BinaryLike, | |
hashCount: number, | |
bitCount: number | |
): number[] => { | |
const pair = hashPair(item) | |
const hashes = { | |
one: pair.one.readUIntBE(0, 6), | |
two: pair.two.readUIntBE(0, 6) | |
} | |
const indices = [] | |
for (let index = 0; index < hashCount; index++) { | |
const doubleHashed = doubleHashing( | |
index, | |
hashes.one, | |
hashes.two, | |
bitCount | |
) | |
indices.push(doubleHashed) | |
} | |
return indices; | |
}; | |
const BloomFilterParameters = t.type({ | |
bitCount: t.number, | |
hashCount: t.number | |
}) | |
type BloomFilterParameters = t.TypeOf<typeof BloomFilterParameters>; | |
export const EncodedSealedBloomFilter = t.type({ | |
words: t.string, | |
parameters: BloomFilterParameters | |
}) | |
export type EncodedSealedBloomFilter = t.TypeOf<typeof EncodedSealedBloomFilter>; | |
class SealedBloomFilter { | |
constructor( | |
private words: ReturnType<typeof BloomFilter["fill"]>, | |
private parameters: BloomFilterParameters | |
) {} | |
has(item: BinaryLike): boolean { | |
for ( | |
const index of getIndices( | |
item, | |
this.parameters.hashCount, | |
this.parameters.bitCount | |
) | |
) { | |
const wordIndex = Math.floor(index / 8) | |
const mask = 1 << index % 8 | |
if ((this.words[wordIndex] & mask) === 0) { | |
return false; | |
} | |
} | |
return true; | |
} | |
encode(): EncodedSealedBloomFilter { | |
return { | |
words: Buffer.from(this.words).toString('base64'), | |
parameters: this.parameters | |
} | |
} | |
} | |
export class BloomFilter { | |
private words: Uint8Array; | |
constructor( | |
private parameters: BloomFilterParameters | |
) { | |
this.words = BloomFilter.fill(parameters.bitCount); | |
} | |
private static fill(bitCount: number) { | |
const bitsPerWord = Uint8Array.BYTES_PER_ELEMENT * 8; | |
const diff = bitsPerWord - (bitCount % bitsPerWord); | |
const size = bitCount + ([0, bitsPerWord].includes(diff) ? 0 : diff) | |
return new Uint8Array(Math.ceil(size / bitsPerWord)) | |
} | |
add(item: BinaryLike): void { | |
for ( | |
const index of getIndices( | |
item, | |
this.parameters.hashCount, | |
this.parameters.bitCount | |
) | |
) { | |
const wordIndex = Math.floor(index / 8) | |
const mask = 1 << index % 8 | |
this.words[wordIndex] = this.words[wordIndex] | mask | |
} | |
} | |
/** | |
* @param items | |
* @param accuracy - desired false positive rate as fraction | |
*/ | |
static from( | |
items: Set<BinaryLike>, | |
accuracy: number, | |
) { | |
const _bitCount = bitCount(items.size, accuracy); | |
const parameters: BloomFilterParameters = { | |
bitCount: _bitCount, | |
hashCount: hashCount(_bitCount, items.size) | |
} | |
const filter = new BloomFilter(parameters); | |
items.forEach(item => filter.add(item)); | |
return filter.seal(); | |
} | |
static decode(encoded: EncodedSealedBloomFilter): Maybe<SealedBloomFilter> { | |
let buffer; | |
try { | |
buffer = Buffer.from(encoded.words, 'base64') | |
} catch { | |
return null; | |
} | |
const words = new Uint8Array(buffer); | |
return new SealedBloomFilter(words, encoded.parameters); | |
} | |
seal(): SealedBloomFilter { | |
return new SealedBloomFilter( | |
new Uint8Array(this.words), | |
{ ...this.parameters } | |
); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment