Skip to content

Instantly share code, notes, and snippets.

@boyter
Created October 29, 2020 02:44
Show Gist options
  • Save boyter/0cffa9ff8e2e7259d455594d744f1164 to your computer and use it in GitHub Desktop.
Save boyter/0cffa9ff8e2e7259d455594d744f1164 to your computer and use it in GitHub Desktop.
Bloom filter example
// Primitive hash function that for a string returns a positive 32 bit int
// Do not use in production, use murmur3 or fnv1
// You can improve this by changing 5 to 31
Object.defineProperty(String.prototype, 'hashCode', {
value: function() {
var hash = 0, i, chr;
for (i = 0; i < this.length; i++) {
chr = this.charCodeAt(i);
hash = ((hash << 5) - hash) + chr;
hash |= 0; // Convert to 32bit integer
}
return Math.abs(hash);
}
});
// start out filter out with 16 0's indicating its empty
// note that this is wasteful because we are using numbers
// ideally you would use boolean or bits to reduce the space
// and using bits allows you to do all sorts of interesting things...
var bloom = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];
// add some words to the filter using a single hash
// we use % to convert our 32 bit number into one of 16 values
// for the length of our bloom filter
bloom["something".hashCode() % bloom.length] = 1;
bloom["another".hashCode() % bloom.length] = 1;
// lets check if notin is in the filter
if (bloom["notin".hashCode() % bloom.length] == 1) {
console.log("notin may be in set...");
} else {
console.log("notin is not in set");
}
// check if something is in the filter
if (bloom["something".hashCode() % bloom.length] == 1) {
console.log("something may be in set...");
} else {
console.log("something is not in set");
}
// check if boyter is in the filter
// this is a false positive! boyter was never added
if (bloom["boyter".hashCode() % bloom.length] == 1) {
console.log("boyter may be in set... FALSE POSITIVE it was never added");
} else {
console.log("boyter is not in set");
}
// lets reset and try a two hash filter where each term is hashed twice to produce different output
var bloom = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0];
var term = "something";
bloom[(term + "-one").hashCode() % bloom.length] = 1;
bloom[(term + "-two").hashCode() % bloom.length] = 1;
// note that two bits are now set
console.log(bloom);
var term = "another";
bloom[(term + "-one").hashCode() % bloom.length] = 1;
bloom[(term + "-two").hashCode() % bloom.length] = 1;
// note that we can set the same bits
console.log(bloom)
// lets check if boyter still gives us a false positive
var term = "boyter";
var bit1 = (term + "-one").hashCode() % bloom.length;
var bit2 = (term + "-two").hashCode() % bloom.length;
if (bloom[bit1] == 1 && bloom[bit2] == 1) {
console.log("boyter may be in set... FALSE POSITIVE");
} else {
console.log("boyter is not in set");
}
// lets reset and try a two hash filter with a much bigger bloom filter, and then try checking with many tests
var bloom = [];
for (var i=0; i<100; i++) {
bloom.push(0)
}
// add two terms
var term = "something";
bloom[(term + "-one").hashCode() % bloom.length] = 1;
bloom[(term + "-two").hashCode() % bloom.length] = 1;
var term = "another";
bloom[(term + "-one").hashCode() % bloom.length] = 1;
bloom[(term + "-two").hashCode() % bloom.length] = 1;
// lets try 1000 terms and see how many false positives we get
var falsePositive = 0;
for (var i=0; i<1000; i++) {
var term = "" + i;
var bit1 = (term + "-one").hashCode() % bloom.length;
var bit2 = (term + "-two").hashCode() % bloom.length;
if (bloom[bit1] == 1 && bloom[bit2] == 1) {
falsePositive++;
}
}
console.log("falsePositive:", falsePositive);
// lets reset with a much much bigger bloom filter and a lot more terms and see what happens
var bloom = [];
for (var i=0; i<100000; i++) {
bloom.push(0)
}
for (var i=0; i<1000; i++) {
var term = "something" + i;
var bit1 = (term + "-one").hashCode() % bloom.length;
var bit2 = (term + "-two").hashCode() % bloom.length;
bloom[bit1] = 1;
bloom[bit2] = 1;
}
var falsePositive = 0;
for (var i=0; i<1000; i++) {
var term = "" + i;
var bit1 = (term + "-one").hashCode() % bloom.length;
var bit2 = (term + "-two").hashCode() % bloom.length;
if (bloom[bit1] == 1 && bloom[bit2] == 1) {
falsePositive++;
}
}
console.log("falsePositive:", falsePositive);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment