Skip to content

Instantly share code, notes, and snippets.

@yukim
Created May 4, 2012 21:43
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yukim/2597943 to your computer and use it in GitHub Desktop.
Save yukim/2597943 to your computer and use it in GitHub Desktop.
HyperLogLog
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.utils;
import java.nio.ByteBuffer;
/**
* Generic interface for cardinality calculator.
*/
public interface EstimatedCardinality
{
/**
* @param value the value seen
*/
void update(ByteBuffer value);
/**
* @return estimated cardinality
*/
double get();
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.utils;
import java.nio.ByteBuffer;
/**
* Cardinality estimation using HyperLogLog algorithm described in
* "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm" by
* P. Flajolet et al.
*/
public class HyperLogLogCounter implements EstimatedCardinality
{
private static final long Pow2_32 = 1L << 32;
private final int b;
// number of registers m = 2^b
private final int m;
private final double alphaM;
// a collection of m registers
private int[] M;
/**
* Create HyperLogLogCounter with design parameter b.
* b must be in the range [4, 16] for optimization.
*
* @param b parameter
*/
public HyperLogLogCounter(int b)
{
assert b >= 4 && b <= 16;
this.b = b;
this.m = 1 << b;
this.alphaM =
b == 4 ? 0.673 // m == 16
: b == 5 ? 0.697 // m == 32
: b == 6 ? 0.709 // m == 64
: 0.7213 / (1 + 1.079 / m);
this.M = new int[m];
}
public void update(ByteBuffer value)
{
int x = MurmurHash.hash32(value, value.position(), value.remaining(), -1);
// determine position in register using first b bits of hashed value x
int j = x >>> (Integer.SIZE - b);
M[j] = Math.max(M[j], rank((x << b) | (1 << (b - 1)) + 1));
}
public double get()
{
// compute Z with 'indicator' function
double Z = 0.0;
for (int i = 0; i < m; ++i)
Z += 1.0 / (1 << M[i]);
// "raw" HyperLogLog estimate
double E = alphaM * m * m / Z;
if (E <= (5.0 / 2.0) * m)
{
// small range correction
int V = 0;
for (int v : M)
if (v == 0) V++;
return V == 0 ? E : m * Math.log(m / V);
}
else if (E <= Pow2_32 / 30.0)
{
// intermediate range collection - no correction
return E;
}
else
{
// large range correction
return -1 * Pow2_32 * Math.log(1.0 - E / Pow2_32);
}
}
/**
* Determine the leftmost position of 1-bit
*
* @param w
* @return the position of the leftmost 1-bit (rank(00011010) = 4)
*/
int rank(int w)
{
return w == 0 ? 0 : 1 + Integer.numberOfLeadingZeros(w);
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.utils;
import org.junit.Before;
import org.junit.Test;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static junit.framework.Assert.assertTrue;
public class HyperLogLogCounterTest
{
private List<String> wordList;
@Before
public void setUp() throws IOException
{
wordList = new ArrayList<String>();
BufferedReader reader = new BufferedReader(new FileReader("/usr/share/dict/words"));
try
{
String line = reader.readLine();
while (line != null)
{
for (String w : Arrays.asList(line.split("[,\\s]+")))
if (w.length() > 0)
wordList.add(w);
line = reader.readLine();
}
}
finally
{
reader.close();
}
}
@Test
public void test()
{
for (int i = 5; i < 17; i++)
{
double expectedError = 1.04 / Math.sqrt(1 << i);
testCardinality(new HyperLogLogCounter(i), expectedError);
}
}
private void testCardinality(EstimatedCardinality counter, double expectedError)
{
for (String w : wordList)
counter.update(ByteBufferUtil.bytes(w));
double count = counter.get();
double actualError = (count - wordList.size()) / wordList.size();
assertTrue("error rate should be in ±" + (expectedError * 100) + "%, but actually " + (actualError * 100), Math.abs(expectedError) >= Math.abs(actualError));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment