Skip to content

Instantly share code, notes, and snippets.

@gom
Last active February 5, 2019 06:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gom/6b4fd5f4c27de49ab912d9ec251383fa to your computer and use it in GitHub Desktop.
Save gom/6b4fd5f4c27de49ab912d9ec251383fa to your computer and use it in GitHub Desktop.
Compare HLL Benchmark: airlift.stats and stream-lib
// Gradle build for the HyperLogLog benchmark programs.
plugins {
    id 'java'
    id 'application'
    // Shadow plugin: used here to package the program together with its dependencies.
    id "com.github.johnrengelman.shadow" version "4.0.4"
}

group 'com.gomlog'
version '1.0-SNAPSHOT'

// Entry point used by the application plugin (and by the shadow jar's manifest).
mainClassName = 'com.gomlog.hll.benchmark.Comparison'

repositories {
    mavenCentral()
}

dependencies {
    // 'implementation' replaces the deprecated 'compile' configuration,
    // which newer Gradle versions no longer accept.
    implementation "com.clearspring.analytics:stream:2.9.6"
    implementation "io.airlift:stats:0.178"
}
package com.gomlog.hll.benchmark;
import java.util.UUID;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
import io.airlift.slice.Slices;
import io.airlift.stats.cardinality.HyperLogLog;
/**
 * Benchmark that feeds the same kind of input to two HyperLogLog implementations
 * (stream-lib's {@code HyperLogLogPlus} and airlift's {@code HyperLogLog}) and prints,
 * for each, the relative estimation error, the elapsed time and the reported sketch size.
 *
 * <p>Each sketch receives {@link #DATA_ROWS} random UUID strings followed by
 * {@link #DATA_ROWS} copies of one fixed string, so the true number of distinct
 * values is {@code DATA_ROWS + FIXED_DATA_ROWS}.
 *
 * <p>Note: the measured time includes generating the UUID strings and encoding them,
 * because that work happens inside the timed loops.
 */
public class Comparison {
    /** Number of random (assumed distinct) UUID strings added to each sketch. */
    static final int DATA_ROWS = 10_000_000;
    /** Number of distinct fixed values added on top of the random ones. */
    static final int FIXED_DATA_ROWS = 1;
    /** First constructor argument ({@code p}) passed to {@code HyperLogLogPlus}. */
    static final int DEFAULT_P = 15;
    /** Second constructor argument ({@code sp}) passed to {@code HyperLogLogPlus}. */
    static final int DEFAULT_SP = 25;

    /**
     * Bucket count passed to {@code HyperLogLog.newInstance}.
     *
     * <p>For reference, the sizing formula
     * {@code Integer.highestOneBit((int) Math.ceil(1.0816 / (0.023 * 0.023)) - 1) << 1}
     * evaluates to 2048; this benchmark uses 4096 instead.
     */
    private static final int AIRLIFT_BUCKETS = 4096;

    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Runs both benchmarks twice; the first pass is labelled as a rehearsal (warm-up)
     * and the second pass is separated from it by a dashed line.
     *
     * @param args ignored
     * @throws Exception if a benchmark fails
     */
    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 2; i++) {
            if (i == 0) {
                System.out.println("Reharsal");
            } else {
                System.out.println("----------");
            }
            streamHll();
            airliftHll();
        }
    }

    /** Returns a freshly generated random UUID in its string form. */
    private static String data() {
        return UUID.randomUUID().toString();
    }

    /** Returns the constant value that is added repeatedly as duplicate input. */
    private static String fixedData() {
        return "0123abCD-4567-6789-89ab-cdefABCDEF01";
    }

    /** Returns the whole milliseconds elapsed since {@code startNanos} (a {@link System#nanoTime()} reading). */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Prints one result line.
     *
     * @param name          label of the implementation under test
     * @param estimateCount cardinality reported by the sketch
     * @param size          size reported by the sketch, in bytes
     * @param durationMsec  elapsed time in milliseconds
     */
    private static void printResult(String name, long estimateCount, int size, long durationMsec) {
        // Relative error against the true distinct count; positive means under-estimation.
        double error = 1d - (double) estimateCount / (DATA_ROWS + FIXED_DATA_ROWS);
        System.out.printf("%s [error: %f, calcTime: %d, estimateCount: %d, dataSize: %d bytes]\n", name, error,
                          durationMsec, estimateCount, size);
    }

    /**
     * Benchmarks stream-lib's {@code HyperLogLogPlus}.
     *
     * @throws Exception if the sketch reports a failure
     */
    private static void streamHll() throws Exception {
        HyperLogLogPlus hll = new HyperLogLogPlus(DEFAULT_P, DEFAULT_SP);
        // nanoTime is monotonic, so the measurement is immune to wall-clock adjustments.
        long start = System.nanoTime();
        for (int i = 0; i < DATA_ROWS; i++) {
            // Encode explicitly as UTF-8 so the input bytes match the airlift benchmark
            // regardless of the platform default charset.
            hll.offer(data().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
        // The same value DATA_ROWS times: duplicates must not inflate the estimate.
        for (int i = 0; i < DATA_ROWS; i++) {
            hll.offer(fixedData().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
        long duration = elapsedMillis(start);
        printResult("stream-lib",
                    hll.cardinality(),
                    hll.sizeof(),
                    duration);
    }

    /** Benchmarks airlift's {@code HyperLogLog}. */
    private static void airliftHll() {
        HyperLogLog hll = HyperLogLog.newInstance(AIRLIFT_BUCKETS);
        long start = System.nanoTime();
        for (int i = 0; i < DATA_ROWS; i++) {
            hll.add(Slices.utf8Slice(data()));
        }
        // The same value DATA_ROWS times: duplicates must not inflate the estimate.
        for (int i = 0; i < DATA_ROWS; i++) {
            hll.add(Slices.utf8Slice(fixedData()));
        }
        long duration = elapsedMillis(start);
        printResult("airlift",
                    hll.cardinality(),
                    hll.estimatedInMemorySize(),
                    duration);
    }
}
$ java -classpath build/libs/hllbenchmark-1.0-SNAPSHOT-all.jar com.gomlog.hll.benchmark.Comparison
Reharsal
stream-lib [error: 0.006106, calcTime: 25014, estimateCount: 9938938, dataSize: 21858 bytes]
airlift [error: -0.013953, calcTime: 19990, estimateCount: 10139534, dataSize: 2182 bytes]
----------
stream-lib [error: -0.005583, calcTime: 22240, estimateCount: 10055835, dataSize: 21858 bytes]
airlift [error: -0.025534, calcTime: 20773, estimateCount: 10255340, dataSize: 2182 bytes]
package com.gomlog.hll.benchmark;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
/**
 * Benchmark for stream-lib's {@code HyperLogLogPlus} that contrasts two ways of
 * building a sketch over {@link #DATA_ROWS} random UUID strings:
 * <ul>
 *   <li>{@link #streamHll()} adds every value to a single sketch (timed: the adds);</li>
 *   <li>{@link #mergeHLL()} first builds many small sketches of {@link #OBJ_BY_BUCKET}
 *       values each, then merges them (timed: only the merge).</li>
 * </ul>
 */
public class AddAndMerge {
    /** Number of random (assumed distinct) UUID strings per benchmark. */
    private static final int DATA_ROWS = 100_000_000;
    /** First constructor argument ({@code p}) passed to {@code HyperLogLogPlus}. */
    private static final int DEFAULT_P = 15;
    /** Second constructor argument ({@code sp}) passed to {@code HyperLogLogPlus}. */
    private static final int DEFAULT_SP = 25;
    /** Number of values added to each small sketch before merging. */
    private static final int OBJ_BY_BUCKET = 10;

    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Runs both benchmarks twice; the first pass is labelled as a rehearsal (warm-up)
     * and the second pass is separated from it by a dashed line.
     *
     * @param args ignored
     * @throws Exception if a benchmark fails
     */
    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 2; i++) {
            if (i == 0) {
                System.out.println("Reharsal");
            } else {
                System.out.println("----------");
            }
            streamHll();
            mergeHLL();
        }
    }

    /** Returns a freshly generated random UUID in its string form. */
    private static String data() {
        return UUID.randomUUID().toString();
    }

    /** Returns the whole milliseconds elapsed since {@code startNanos} (a {@link System#nanoTime()} reading). */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Prints one result line.
     *
     * @param name          label of the scenario
     * @param estimateCount cardinality reported by the sketch
     * @param size          size reported by the sketch, in bytes
     * @param durationMsec  elapsed time in milliseconds
     */
    private static void printResult(String name, long estimateCount, int size, long durationMsec) {
        // Only DATA_ROWS random values are ever added (no fixed value in this benchmark),
        // so DATA_ROWS is the true distinct count. Positive error means under-estimation.
        double error = 1d - (double) estimateCount / DATA_ROWS;
        System.out.printf("%s [error: %f, calcTime: %d, estimateCount: %d, dataSize: %d bytes]\n", name, error,
                          durationMsec, estimateCount, size);
    }

    /**
     * Adds {@link #DATA_ROWS} values to one sketch and reports the time spent adding.
     * The timed loop also includes generating and encoding each UUID string.
     *
     * @throws Exception if the sketch reports a failure
     */
    private static void streamHll() throws Exception {
        HyperLogLogPlus hll = new HyperLogLogPlus(DEFAULT_P, DEFAULT_SP);
        // nanoTime is monotonic, so the measurement is immune to wall-clock adjustments.
        long start = System.nanoTime();
        for (int i = 0; i < DATA_ROWS; i++) {
            // Explicit charset: do not depend on the platform default encoding.
            hll.offer(data().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
        long duration = elapsedMillis(start);
        printResult("stream-lib",
                    hll.cardinality(),
                    hll.sizeof(),
                    duration);
    }

    /**
     * Builds {@code DATA_ROWS / OBJ_BY_BUCKET} small sketches (untimed), then merges
     * them all into the first one and reports the time spent merging.
     */
    private static void mergeHLL() {
        int bucketNums = DATA_ROWS / OBJ_BY_BUCKET;
        // Presized: the number of sketches is known up front.
        List<HyperLogLogPlus> stateList = new ArrayList<>(bucketNums);
        for (int i = 0; i < bucketNums; i++) {
            HyperLogLogPlus hll = new HyperLogLogPlus(DEFAULT_P, DEFAULT_SP);
            for (int j = 0; j < OBJ_BY_BUCKET; j++) {
                hll.offer(data().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
            stateList.add(hll);
        }
        if (stateList.isEmpty()) {
            // Nothing to merge, nothing to report.
            return;
        }

        long start = System.nanoTime();
        // Plain loop instead of Stream.reduce: the merge mutates its target, which a
        // reduce accumulator is not supposed to do.
        HyperLogLogPlus merged = stateList.get(0);
        try {
            for (HyperLogLogPlus other : stateList.subList(1, stateList.size())) {
                merged.addAll(other);
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to merge HyperLogLogPlus sketches", e);
        }
        long duration = elapsedMillis(start);
        printResult("merge",
                    merged.cardinality(),
                    merged.sizeof(),
                    duration);
    }
}
❯ java -jar build/libs/hllbenchmark-1.0-SNAPSHOT.jar com.gomlog.hll.benchmark.AddAndMerge
Reharsal
stream-lib [error: -0.004718, calcTime: 152388, estimateCount: 100471783, dataSize: 21848 bytes]
merge [error: 0.004813, calcTime: 7556, estimateCount: 99518687, dataSize: 21848 bytes]
----------
stream-lib [error: 0.002818, calcTime: 174693, estimateCount: 99718204, dataSize: 21848 bytes]
merge [error: -0.004745, calcTime: 7123, estimateCount: 100474510, dataSize: 21848 bytes]
package com.gomlog.hll.benchmark;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import io.airlift.slice.Slices;
import io.airlift.stats.cardinality.HyperLogLog;
/**
 * Benchmark for airlift's {@code HyperLogLog} that contrasts two ways of building a
 * sketch over {@link #DATA_ROWS} random UUID strings:
 * <ul>
 *   <li>{@link #airliftHll()} adds every value to a single sketch (timed: the adds);</li>
 *   <li>{@link #mergeHLL()} first builds many small sketches of {@link #OBJ_BY_BUCKET}
 *       values each, then merges them (timed: only the merge).</li>
 * </ul>
 */
public class AddAndMerge {
    /** Number of random (assumed distinct) UUID strings per benchmark. */
    private static final int DATA_ROWS = 100_000_000;
    /** Bucket count passed to every {@code HyperLogLog.newInstance} call. */
    private static final int NUMBER_OF_BUCKETS = 4096;
    /** Number of values added to each small sketch before merging. */
    private static final int OBJ_BY_BUCKET = 100;

    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Runs both benchmarks twice; the first pass is labelled as a rehearsal (warm-up)
     * and the second pass is separated from it by a dashed line.
     *
     * @param args ignored
     * @throws Exception if a benchmark fails
     */
    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 2; i++) {
            if (i == 0) {
                System.out.println("Reharsal");
            } else {
                System.out.println("----------");
            }
            airliftHll();
            mergeHLL();
        }
    }

    /** Returns a freshly generated random UUID in its string form. */
    private static String data() {
        return UUID.randomUUID().toString();
    }

    /** Returns the whole milliseconds elapsed since {@code startNanos} (a {@link System#nanoTime()} reading). */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Prints one result line.
     *
     * @param name          label of the scenario
     * @param estimateCount cardinality reported by the sketch
     * @param size          size reported by the sketch, in bytes
     * @param durationMsec  elapsed time in milliseconds
     */
    private static void printResult(String name, long estimateCount, int size, long durationMsec) {
        // Only DATA_ROWS random values are ever added (no fixed value in this benchmark),
        // so DATA_ROWS is the true distinct count. Positive error means under-estimation.
        double error = 1d - (double) estimateCount / DATA_ROWS;
        System.out.printf("%s [error: %f, calcTime: %d, estimateCount: %d, dataSize: %d bytes]\n", name, error,
                          durationMsec, estimateCount, size);
    }

    /**
     * Adds {@link #DATA_ROWS} values to one sketch and reports the time spent adding.
     * The timed loop also includes generating each UUID string.
     */
    private static void airliftHll() {
        HyperLogLog hll = HyperLogLog.newInstance(NUMBER_OF_BUCKETS);
        // nanoTime is monotonic, so the measurement is immune to wall-clock adjustments.
        long start = System.nanoTime();
        for (int i = 0; i < DATA_ROWS; i++) {
            hll.add(Slices.utf8Slice(data()));
        }
        long duration = elapsedMillis(start);
        printResult("airlift",
                    hll.cardinality(),
                    hll.estimatedInMemorySize(),
                    duration);
    }

    /**
     * Builds {@code DATA_ROWS / OBJ_BY_BUCKET} small sketches (untimed), then merges
     * them all into the first one and reports the time spent merging.
     */
    private static void mergeHLL() {
        int bucketNums = DATA_ROWS / OBJ_BY_BUCKET;
        // Presized: the number of sketches is known up front.
        List<HyperLogLog> stateList = new ArrayList<>(bucketNums);
        for (int i = 0; i < bucketNums; i++) {
            HyperLogLog hll = HyperLogLog.newInstance(NUMBER_OF_BUCKETS);
            for (int j = 0; j < OBJ_BY_BUCKET; j++) {
                hll.add(Slices.utf8Slice(data()));
            }
            stateList.add(hll);
        }
        if (stateList.isEmpty()) {
            // Nothing to merge, nothing to report.
            return;
        }

        long start = System.nanoTime();
        // Plain loop instead of Stream.reduce: the merge mutates its target, which a
        // reduce accumulator is not supposed to do.
        HyperLogLog merged = stateList.get(0);
        for (HyperLogLog other : stateList.subList(1, stateList.size())) {
            merged.mergeWith(other);
        }
        long duration = elapsedMillis(start);
        printResult("merge",
                    merged.cardinality(),
                    merged.estimatedInMemorySize(),
                    duration);
    }
}
❯ java -jar build/libs/hllbenchmark-1.0-SNAPSHOT.jar com.gomlog.hll.benchmark.AddAndMerge
Reharsal
airlift [error: 0.001307, calcTime: 151422, estimateCount: 99869317, dataSize: 2182 bytes]
merge [error: -0.026703, calcTime: 26533, estimateCount: 102670315, dataSize: 2177 bytes]
----------
airlift [error: -0.010419, calcTime: 151927, estimateCount: 101041874, dataSize: 2182 bytes]
merge [error: -0.019530, calcTime: 26263, estimateCount: 101953007, dataSize: 2177 bytes]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment