Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@terrancesnyder
Created August 19, 2012 23:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save terrancesnyder/3398489 to your computer and use it in GitHub Desktop.
Save terrancesnyder/3398489 to your computer and use it in GitHub Desktop.
hyperloglog_test.js
/**
* Author - Shutupandcode - Terrance A. Snyder (http://www.shutupandcode.net & http://www.terranceasnyder.com)
* w/ http://stackoverflow.com/users/36174/actual
*
* HyperLogLog implementation ripped from:
* http://stackoverflow.com/questions/5990713/loglog-algorithm-for-counting-of-large-cardinalities
*
* However modifications made to allow for intersect, union, etc
* to allow for comparing two instances together.
*
* Arguments:
* @std_error - Standard error level acceptable for your dataset.
* Defaults to 0.005 for maximum accuracy.
* Depending on set size may want to increase this to ~0.015 or more.
* @hashFunc - The hash function to run against the string.
* Default Implementation currently uses FNV hash - use MurMurHash or other at your discretion.
* @bytes - Optional - the bytes to populate this HyperLogLog with, used when doing intersect, union, etc.
*/
var HyperLogLog = function(std_error, hashFunc, bytes) {
var pow_2_32 = 0xFFFFFFFF + 1;
function fnv1a_hash(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}
function log2(x)
{
return Math.log(x) / Math.LN2;
}
function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}
// check if hash function supplied if not default to fnv hash
hashFunc = hashFunc ? hashFunc : fnv1a_hash;
var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m));
var k_comp = 32 - k;
m = Math.pow(2, k);
var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);
// setup of bitset
var M = [];
if (!bytes) {
for (var i = 0; i < m; ++i) {
M[i] = 0;
}
} else {
M = bytes;
}
function count(stringValue)
{
if (stringValue !== undefined && stringValue)
{
var hash = hashFunc(stringValue);
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;
// -- make corrections
if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);
// --
return E;
}
}
return {
/**
* The count function to call with either the word to add to the
* count, or with null to get the total count.
*/
count: count,
/**
* The matrix/set used for cloning and for the intersection, union, etc.
*/
M: M,
/**
* Returns a new HyperLogLog that represents the union
* between all items.
*
* @param {Object} ll The other HyperLogLog to intersect with
*/
union: function(ll) {
if (ll == null) return this;
var mergedBytes = M.slice(0); // clone array
for (var i=0; i<mergedBytes.length;i++) {
mergedBytes[i] = Math.max(mergedBytes[i], ll.M[i]);
}
return HyperLogLog(std_error, hashFunc, mergedBytes);
},
/**
* Returns a new HyperLogLog with only those items that intersect
* @param {Object} ll The other HyperLogLog to intersect with
*/
intersect: function(ll) {
if (ll == null) return this;
var mergedBytes = M.slice(0); // clone array
for (var i=0; i<mergedBytes.length;i++) {
mergedBytes[i] = Math.min(mergedBytes[i], ll.M[i]);
}
return HyperLogLog(std_error, hashFunc, mergedBytes);
},
/**
* A\B means the set that contains all those elements of A that are not in B
* @param {Object} ll The other HyperLogLog to do without.
*/
without: function(ll) {
if (ll == null) return this;
var mergedBytes = M.slice(0); // clone array
for (var i=0; i<mergedBytes.length;i++) {
mergedBytes[i] = mergedBytes[i] > ll.M[i] ? mergedBytes[i] : 0;
}
return HyperLogLog(std_error, hashFunc, mergedBytes);
}
};
};
/**
* JS Implementation of MurmurHash3 (r136) (as of May 20, 2011)
*
* @author <a href="mailto:gary.court@gmail.com">Gary Court</a>
* @see http://github.com/garycourt/murmurhash-js
* @author <a href="mailto:aappleby@gmail.com">Austin Appleby</a>
* @see http://sites.google.com/site/murmurhash/
*
* @param {string} key ASCII only
* @param {number} seed Positive integer only
* @return {number} 32-bit positive integer hash
*/
function murmurhash3_32_gc(key, seed) {
var remainder, bytes, h1, h1b, c1, c1b, c2, c2b, k1, i;
remainder = key.length & 3; // key.length % 4
bytes = key.length - remainder;
h1 = seed ? seed : 1;
c1 = 0xcc9e2d51;
c2 = 0x1b873593;
i = 0;
while (i < bytes) {
k1 =
((key.charCodeAt(i) & 0xff)) |
((key.charCodeAt(++i) & 0xff) << 8) |
((key.charCodeAt(++i) & 0xff) << 16) |
((key.charCodeAt(++i) & 0xff) << 24);
++i;
k1 = ((((k1 & 0xffff) * c1) + ((((k1 >>> 16) * c1) & 0xffff) << 16))) & 0xffffffff;
k1 = (k1 << 15) | (k1 >>> 17);
k1 = ((((k1 & 0xffff) * c2) + ((((k1 >>> 16) * c2) & 0xffff) << 16))) & 0xffffffff;
h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19);
h1b = ((((h1 & 0xffff) * 5) + ((((h1 >>> 16) * 5) & 0xffff) << 16))) & 0xffffffff;
h1 = (((h1b & 0xffff) + 0x6b64) + ((((h1b >>> 16) + 0xe654) & 0xffff) << 16));
}
k1 = 0;
switch (remainder) {
case 3: k1 ^= (key.charCodeAt(i + 2) & 0xff) << 16;
case 2: k1 ^= (key.charCodeAt(i + 1) & 0xff) << 8;
case 1: k1 ^= (key.charCodeAt(i) & 0xff);
k1 = (((k1 & 0xffff) * c1) + ((((k1 >>> 16) * c1) & 0xffff) << 16)) & 0xffffffff;
k1 = (k1 << 15) | (k1 >>> 17);
k1 = (((k1 & 0xffff) * c2) + ((((k1 >>> 16) * c2) & 0xffff) << 16)) & 0xffffffff;
h1 ^= k1;
}
h1 ^= key.length;
h1 ^= h1 >>> 16;
h1 = (((h1 & 0xffff) * 0x85ebca6b) + ((((h1 >>> 16) * 0x85ebca6b) & 0xffff) << 16)) & 0xffffffff;
h1 ^= h1 >>> 13;
h1 = ((((h1 & 0xffff) * 0xc2b2ae35) + ((((h1 >>> 16) * 0xc2b2ae35) & 0xffff) << 16))) & 0xffffffff;
h1 ^= h1 >>> 16;
return h1 >>> 0;
};
var setA = new HyperLogLog(0.005, murmurhash3_32_gc);
var setB = new HyperLogLog(0.005, murmurhash3_32_gc);
setA.count('cat');
setA.count('dog');
setA.count('fish');
setB.count('cat');
setB.count('bird');
console.log('setA: ' + setA.count());
console.log('setB: ' + setB.count());
console.log('setA ∪ setB : ' + setA.union(setB).count());
console.log('setA ∩ setB : ' + setA.intersect(setB).count());
console.log('A \\ B: ' + setA.without(setB).count());
console.log('B \\ A: ' + setB.without(setA).count());
function generateRandomWords(count,randomness) {
var result = [];
// randomness of a UUID is cryptographic, such that:
// substring(1) = 1/16, substring(2) = 1/256, substring(3) = 1/4096, etc...
while (count > 0) {
result.push(UUID.randomUUID().substring(0,randomness));
count--;
}
return result;
};
var randomness = 4;
var expected = Math.pow(16,randomness);
var words = generateRandomWords(500000,randomness);
var hll = HyperLogLog(0.015, murmurhash3_32_gc);
var watch = new StopWatch().start();
for (var i=0;i<words.length;i++) {
hll.count(words[i]);
}
var estimate = hll.count().toFixed(0);
var error_rate = (estimate - expected) / (expected / 100.0);
console.log(words.length + ' words with expected uniqueness of ' + expected + ' estimated in ' + watch.stop().total() + 'ms (error rate: ' + error_rate + '%)');
@terrancesnyder
Copy link
Author

== Performance of 100k (1/4096 ramdom) [Chrome]

100000 words with expected uniqueness of 4096 estimated in 137ms (error rate: 0.048828125%) 

@terrancesnyder
Copy link
Author

== Performance of 5 million (1/1,048,576 random) [Chrome]

5000000 words with expected uniqueness of 1048576 estimated in 5732ms (error rate: -0.9981155395507812%)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment