Skip to content

Instantly share code, notes, and snippets.

@s-j
Last active December 3, 2017 19:19
Show Gist options
  • Save s-j/2303b1233b8a932f6c3803c60ce7f372 to your computer and use it in GitHub Desktop.
Save s-j/2303b1233b8a932f6c3803c60ce7f372 to your computer and use it in GitHub Desktop.
import net.agkn.hll.*;
import com.clearspring.analytics.stream.cardinality.*;
/**
* A brief comparison of the cardinality estimator implementations
* from Clearspring and Agregated Knowledge.
**/
public final class CardinalityBench {
private CardinalityBench() { }
private static boolean enablePrint = false;
private static int[] testBaseline(List<String> lines) {
if (enablePrint) {
println(lines.size() + " lines");
}
long start = System.currentTimeMillis();
Set<String> unique = Sets.newHashSet();
for (String line : lines) {
unique.add(line);
}
long stop = System.currentTimeMillis();
if (enablePrint) {
println(unique.size() + " keys (" + (stop - start) + "ms)");
}
start = System.currentTimeMillis();
Set<Long> uniqueHashes = Sets.newHashSet();
for (String line : lines) {
// Using the 32bit hash function provided with the implementation,
// performs better then murmur3 from Guava (both time and precision).
uniqueHashes.add(MurmurHash.hash64(line));
}
stop = System.currentTimeMillis();
if (enablePrint) {
println(uniqueHashes.size() + " hashed keys (" + (stop - start) + "ms)");
}
return new int[] {unique.size(), uniqueHashes.size()};
}
private static int[] getCardinalityAndSize(
ICardinality estimator, List<String> lines) throws IOException {
long start = System.currentTimeMillis();
for (String line : lines) {
estimator.offer(line);
}
long stop = System.currentTimeMillis();
int bytes = estimator.getBytes().length;
return new int[]{(int) estimator.cardinality(), bytes, (int) (stop - start)};
}
private static void printHeader() {
if (enablePrint) {
println(String.format("%s\t%s\t%s\t%s", "name", "error", "size", "time"));
}
}
private static void print(String name, int res, int base, int size, int time) {
if (enablePrint) {
println(String.format("%s\t%.4f\t%sB\t%sms",
name, ((double) (res - base) / base), size, time));
}
}
// Test linear counting tuned to support 10 mil unique items with 1% error.
private static void testLinearCounting(List<String> lines, int baseline)
throws IOException {
int[] result = getCardinalityAndSize(
new LinearCounting.Builder().withError(0.01, 10000000).build(), lines);
print("lincnt", result[0], baseline, result[1], result[2]);
}
// Tune the following three methods to 1% relative standard error.
// Estimation is based on the assumption m = (beta / error) ^ 2
// Values for beta are taken from the Flajolet's 2007 paper
private static void testLogLog(List<String> lines, int baseline)
throws IOException {
double rse = 0.01;
double beta = 1.3;
// Use 1.3 instead of 1.04 since the implementation
// does not throw the 30% of smallest values
int log2m = (int) (Math.log(Math.pow((beta / rse), 2)) / Math.log(2));
int[] result = getCardinalityAndSize(new LogLog(log2m), lines);
print("ll__" + log2m, result[0], baseline, result[1], result[2]);
}
private static void testHyperLogLog(List<String> lines, int baseline)
throws IOException {
double rsd = 0.01;
double beta = 1.106;
int log2m = (int) (Math.log(Math.pow(beta / rsd, 2)) / Math.log(2));
int[] result = getCardinalityAndSize(new HyperLogLog(log2m), lines);
print("hll_" + log2m, result[0], baseline, result[1], result[2]);
}
private static void testHyperLogLogPlus(List<String> lines, int baseline)
throws IOException {
double rsd = 0.01;
double beta = 1.04;
int log2m = (int) (Math.log(Math.pow(beta / rsd, 2)) / Math.log(2));
int[] result = getCardinalityAndSize(new HyperLogLogPlus(log2m, 32), lines);
print("hlp" + log2m, result[0], baseline, result[1], result[2]);
}
private static void testHyperLogLogPlus2(List<String> lines, int baseline)
throws IOException {
for (int log2m = 10; log2m <= 14; log2m++) {
int[] result = getCardinalityAndSize(new HyperLogLogPlus(log2m, 32), lines);
print("hlp" + log2m, result[0], baseline, result[1], result[2]);
}
}
private static void testHLL(List<String> lines, int baseline,
int log2m, int r, int ethr, HLLType type) throws IOException {
final HLL hll = new HLL(log2m, r, ethr, false, type);
long start = System.currentTimeMillis();
for (String line : lines) {
hll.addRaw(MurmurHash.hash64(line));
}
long stop = System.currentTimeMillis();
int cnt = (int) hll.cardinality();
int size = hll.toBytes().length;
print("hlx_" + log2m + "_" + r + "_" + ethr + "_" + type,
cnt, baseline, size, (int)(stop - start));
}
private static void testHLL(List<String> lines, int baseline) throws IOException {
for (int log2m = 10; log2m <= 14; log2m++) {
int r = 5;
testHLL(lines, baseline, log2m, r, -1, HLLType.FULL);
testHLL(lines, baseline, log2m, r, -1, HLLType.SPARSE);
}
}
private static void test(String type
throws IOException, CardinalityMergeException {
println("Testing " + type);
List<String> lines = Files.readLines(
new File("/home/simonj/testdata/" + type + ".txt"), Charsets.UTF_8);
int baseline = testBaseline(lines)[0];
println();
printHeader();
testLinearCounting(lines, baseline);
testLogLog(lines, baseline);
testHyperLogLog(lines, baseline);
testHyperLogLogPlus(lines, baseline);
println();
testHyperLogLogPlus2(lines, baseline);
println();
testHLL(lines, baseline);
println();
//test64(lines);
}
public static void main(String[] args)
throws IOException,CardinalityMergeException {
for (int i = 9; i < 10; i++) {
if (i < 9) {
print(i + ",");
} else {
enablePrint = true;
}
test("datasetA");
test("datasetB");
}
}
// Produces the same result as the original!
public static int getHLLP(List<String> lines)
throws IOException, CardinalityMergeException {
HyperLogLogPlus[] estimators = new HyperLogLogPlus[1];
for (int i = 0; i < estimators.length; i++) {
estimators[i] = new HyperLogLogPlus(13);
}
for (String line : lines) {
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong();
estimators[(int) (value & (estimators.length - 1))].offer(line);
}
return (int) estimators[0].merge(estimators).cardinality();
}
// Produces the same result as the original!
public static int getHLLP64(List<String> lines)
throws IOException, CardinalityMergeException {
HyperLogLogPlus[] estimators = new HyperLogLogPlus[64];
for (int i = 0; i < estimators.length; i++) {
estimators[i] = new HyperLogLogPlus(13);
}
for (String line : lines) {
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong();
estimators[(int) (value & (estimators.length - 1))].offer(line);
}
return (int) estimators[0].merge(estimators).cardinality();
}
// Produces the same result as the original!
public static int getHLLP64v2(List<String> lines)
throws IOException, CardinalityMergeException {
HyperLogLogPlus[] estimators = new HyperLogLogPlus[64];
for (int i = 0; i < estimators.length; i++) {
estimators[i] = new HyperLogLogPlus(13);
}
for (String line : lines) {
long value = Hashing.murmur3_128().hashUnencodedChars(line).asLong();
estimators[(int)(Math.random() * estimators.length)].offer(line);
estimators[13].offer(line);
}
return (int) estimators[0].merge(estimators).cardinality();
}
private static void test64(List<String> lines)
throws IOException, CardinalityMergeException {
println(getHLLP(lines));
println(getHLLP64(lines));
println(getHLLP64v2(lines));
}
private static<T> void println(T arg) {
System.out.println(arg);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment