Skip to content

Instantly share code, notes, and snippets.

@VorticonCmdr
Created December 4, 2012 16:55
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save VorticonCmdr/4206141 to your computer and use it in GitHub Desktop.
Save VorticonCmdr/4206141 to your computer and use it in GitHub Desktop.
hyperloglog for elasticsearch script facet plugin (alpha code)
var HASH_LENGTH = 32,
HASH_K = 5;
function hash(str) {
var hash = 0;
for (var i = 0, l = str.length; i < l; i++) {
hash += str.charCodeAt(i);
hash += hash << 10;
hash ^= hash >> 6;
}
hash += hash << 3;
hash ^= hash >> 6;
hash += hash << 16;
return hash;
}
function scan1(bites) {
if (bites == 0) {
return HASH_LENGTH - HASH_K;
}
var offset = parseInt(Math.log(bites) / Math.log(2));
offset = HASH_LENGTH - HASH_K - offset;
return offset;
}
function getBites(bites, start, end) {
var r = bites >> (HASH_LENGTH - end);
r = r & (Math.pow(2, end - start + 1) - 1);
return r;
}
var k = doc[key].value;
if (facet[k] == null || typeof facet[k] == 'undefined') {
facet[k] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; //HASH_LENGTH
}
var h = hash(doc[value].value),
j = getBites(h, 1, HASH_K) + 1,
w = getBites(h, HASH_K + 1, HASH_LENGTH);
w = scan1(w);
if (typeof facet[k][j] == 'undefined' || facet[k][j] < w) {
facet[k][j] = w;
}
var HASH_LENGTH = 32, // bites
HASH_K = 5; // HASH_LENGTH = 2 ^ HASH_K
var alpha = 0.697122946; // 1 / (32 * integral(0,inf)( (log2(1+1/(1+x)))^32 dx))
function estimate(M) {
var Z = 0;
for (var i = 1; i <= HASH_LENGTH; i++) {
if (typeof M[i] != 'undefined' && M[i] != 0) {
Z += 1 / Math.pow(2, M[i]);
} else {
Z += 1;
}
}
Z = alpha * HASH_LENGTH * HASH_LENGTH / Z;
return parseInt(Z);
}
function BinaryHeap(scoreFunction){
this.content = [];
this.scoreFunction = scoreFunction;
}
BinaryHeap.prototype = {
push: function(element) {
// Add the new element to the end of the array.
this.content.push(element);
// Allow it to bubble up.
this.bubbleUp(this.content.length - 1);
},
pop: function() {
// Store the first element so we can return it later.
var result = this.content[0];
// Get the element at the end of the array.
var end = this.content.pop();
// If there are any elements left, put the end element at the
// start, and let it sink down.
if (this.content.length > 0) {
this.content[0] = end;
this.sinkDown(0);
}
return result;
},
remove: function(node) {
var len = this.content.length;
// To remove a value, we must search through the array to find
// it.
for (var i = 0; i < len; i++) {
if (this.content[i] == node) {
// When it is found, the process seen in 'pop' is repeated
// to fill up the hole.
var end = this.content.pop();
if (i != len - 1) {
this.content[i] = end;
if (this.scoreFunction(end) < this.scoreFunction(node))
this.bubbleUp(i);
else
this.sinkDown(i);
}
return;
}
}
throw new Error("Node not found.");
},
size: function() {
return this.content.length;
},
bubbleUp: function(n) {
// Fetch the element that has to be moved.
var element = this.content[n];
// When at 0, an element can not go up any further.
while (n > 0) {
// Compute the parent element's index, and fetch it.
var parentN = Math.floor((n + 1) / 2) - 1,
parent = this.content[parentN];
// Swap the elements if the parent is greater.
if (this.scoreFunction(element) < this.scoreFunction(parent)) {
this.content[parentN] = element;
this.content[n] = parent;
// Update 'n' to continue at the new position.
n = parentN;
}
// Found a parent that is less, no need to move it further.
else {
break;
}
}
},
sinkDown: function(n) {
// Look up the target element and its score.
var length = this.content.length,
element = this.content[n],
elemScore = this.scoreFunction(element);
while(true) {
// Compute the indices of the child elements.
var child2N = (n + 1) * 2, child1N = child2N - 1;
// This is used to store the new position of the element,
// if any.
var swap = null;
// If the first child exists (is inside the array)...
if (child1N < length) {
// Look it up and compute its score.
var child1 = this.content[child1N],
child1Score = this.scoreFunction(child1);
// If the score is less than our element's, we need to swap.
if (child1Score < elemScore)
swap = child1N;
}
// Do the same checks for the other child.
if (child2N < length) {
var child2 = this.content[child2N],
child2Score = this.scoreFunction(child2);
if (child2Score < (swap == null ? elemScore : child1Score))
swap = child2N;
}
// If the element needs to be moved, swap it, and continue.
if (swap != null) {
this.content[n] = this.content[swap];
this.content[swap] = element;
n = swap;
}
// Otherwise, we are done.
else {
break;
}
}
}
};
var l = facets.length;
var d = facets[0];
for (var i=1; i<l; i++) {
var f = facets[i];
for (k in f) {
if ((typeof d[k] != 'undefined') && (d[k] != null)) {
var M = d[k];
var N = f[k];
for (var j=0; j<N.length; j++) {
if (M[j] < N[j]) {
M[j] = N[j];
}
}
d[k] = M;
}
else {
d[k] = f[k];
}
}
}
var size = 10;
var heap = new BinaryHeap(function(x){return x;});
var v = 0;
var h = {};
for (k in d) {
v = estimate(d[k]);
h = {};
h['url'] = k;
h['count'] = v;
if ((heap.size() < size) || (v > heap.content[0]['count'])) {
if (heap.size() == size) {
heap.pop();
}
heap.push(h);
}
}
facet = heap.content;
prerequistes:
https://github.com/imotov/elasticsearch-facet-script
https://github.com/elasticsearch/elasticsearch-lang-javascript
"facets": {
"facet1": {
"script": {
"map_script": "hll_map",
"reduce_script" : "my_reduce",
"params" : {
"facet" : {},
"key" : "term",
"value": "unique"
}
}
}
}
@redox
Copy link

redox commented Feb 22, 2014

Hey there, it has been a while since you posted this gist. Any feedback/recommendation on estimating the number of uniq values of a field? Do you still use this JS code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment