Skip to content

Instantly share code, notes, and snippets.

@miguno
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miguno/00217d87fd340ad7f370 to your computer and use it in GitHub Desktop.
Save miguno/00217d87fd340ad7f370 to your computer and use it in GitHub Desktop.
Benchmarking Count-Min Sketch and SketchMap implementations in Algebird
package com.twitter.algebird.caliper
import com.google.caliper.api.Macrobenchmark
import com.google.caliper.{Param, Benchmark}
import com.twitter.algebird.CountMinSketchMonoid
/**
* Run via [[com.twiter.algebird.caliper.Runner]], using the following CLI options:
*
* "--time-limit 90s --instrument macro com.twitter.algebird.caliper.CMSBenchmark"
*
* JVM parameters: "-server -Xms256m -Xmx512m"
*/
class CMSBenchmark extends Benchmark {
@Param(Array("0.1", "0.005"))
val eps: Double = 0.0
@Param(Array("0.0000001" /* 1E-8 */))
val delta: Double = 0.0
@Param(Array("0.2"))
val heavyHittersPct: Double = 0.0
@Param(Array("100"))
val operations: Int = 0
var cmsMonoid: CountMinSketchMonoid = _
override def setUp {
cmsMonoid = {
val seed = 1
new CountMinSketchMonoid(eps, delta, seed, heavyHittersPct)
}
}
@Macrobenchmark
def timePlus() {
(1 to operations).view.foldLeft(cmsMonoid.zero)((l, r) => l ++ cmsMonoid.create(r))
}
}
package com.twitter.algebird.caliper
import com.google.caliper.api.Macrobenchmark
import com.google.caliper.{Param, Benchmark}
import com.twitter.algebird.{GenCountMinSketchImplicits, GenCountMinSketchMonoid}
/**
* Run via [[com.twiter.algebird.caliper.Runner]], using the following CLI options:
*
* "--time-limit 90s --instrument macro com.twitter.algebird.caliper.GenericCMSBenchmark"
*
* JVM parameters: "-server -Xms256m -Xmx512m"
*/
class GenericCMSBenchmark extends Benchmark {
type NUM = BigInt // or Long
@Param(Array("0.1", "0.005"))
val eps: Double = 0.0
@Param(Array("0.0000001" /* 1E-8 */))
val delta: Double = 0.0
@Param(Array("0.2"))
val heavyHittersPct: Double = 0.0
@Param(Array("100"))
val operations: Int = 0
@Param(Array("2048"))
val maxBits: Int = 0
var random: scala.util.Random = _
var cmsMonoid: GenCountMinSketchMonoid[NUM] = _
override def setUp {
// Required import of implicit values (e.g. for BigInt- or Long-backed CMS instances)
import GenCountMinSketchImplicits._
cmsMonoid = {
val seed = 1
new GenCountMinSketchMonoid[NUM](eps, delta, seed, heavyHittersPct)
}
random = new scala.util.Random
}
@Macrobenchmark
def timePlus() {
(1 to operations).view.foldLeft(cmsMonoid.zero)((l, r) => {
// Case A: K=Long, counting the sequence (1, 2, ..., operations)
//l ++ cmsMonoid.create(r)
// Case B.1: K=BigInt, counting the sequence (1, 2, ..., operations)
//l ++ cmsMonoid.create(r)
// Case B.2: K=BigInt, drawing numbers randomly from 2^maxBits address space
val n = scala.math.BigInt(maxBits, random)
l ++ cmsMonoid.create(n)
})
}
}
package com.twitter.algebird.caliper
import com.google.caliper.api.Macrobenchmark
import com.google.caliper.{Param, Benchmark}
import com.twitter.algebird.{SketchMapMonoid, SketchMapParams}
/**
* Run via [[com.twiter.algebird.caliper.Runner]], using the following CLI options:
*
* "--time-limit 300s --instrument macro com.twitter.algebird.caliper.SketchMapBenchmark"
*
* JVM parameters: "-server -Xms256m -Xmx512m"
*/
class SketchMapBenchmark extends Benchmark {
type K = BigInt // or Long
type V = Long
@Param(Array("0.1", "0.005"))
val eps: Double = 0.0
@Param(Array("0.0000001" /* 1E-8 */))
val delta: Double = 0.0
@Param(Array("100", "1000"))
val heavyHittersCount: Int = 0
@Param(Array("100"))
val operations: Int = 0
@Param(Array("2048"))
val maxBits: Int = 0
var random: scala.util.Random = _
var sketchMapMonoid: SketchMapMonoid[K, V] = _
override def setUp {
sketchMapMonoid = {
// for SketchMap[K=Long, Long]
//import com.twitter.bijection.Conversion.asMethod
//def explicitSerialization(i: K): Array[Byte] = i.as[Array[Byte]]
// for SketchMap[K=BigInt, Long]
def explicitSerialization(i: K): Array[Byte] = i.toByteArray
val seed = 1
val params = SketchMapParams[K](seed, eps, delta, heavyHittersCount)(explicitSerialization)
new SketchMapMonoid[K, V](params)
}
random = new scala.util.Random
}
@Macrobenchmark
def timePlus() {
(1 to operations).view.foldLeft(sketchMapMonoid.zero)((l, r) => {
// Case A: K=Long, counting the sequence (1, 2, ..., operations)
//val pair = (r.toLong, 1L)
//sketchMapMonoid.plus(l, sketchMapMonoid.create(pair))
// Case B.1: K=BigInt, counting the sequence (1, 2, ..., operations)
//val pair = (BigInt(r), 1L)
//sketchMapMonoid.plus(l, sketchMapMonoid.create(pair))
// Case B.2: K=BigInt, drawing numbers randomly from 2^maxBits address space
val n = scala.math.BigInt(maxBits, random)
val pair = (n, 1L)
sketchMapMonoid.plus(l, sketchMapMonoid.create(pair))
})
}
}

Summary results, variant A

For the following parameters:

  • eps = 0.005
  • delta = 1E-8
  • seed = 1
  • heavyHittersPct: 0.2 (CMS variants) / heavyHittersCount: 1000 (SketchMap)

Input data: 100 small numbers, i.e. the sequence (1, 2, ..., 100).

Results:

(*) denotes new, generic CMS implementation

Summary results, variant B

Same parameters as above but different input data. This benchmark, when run against BigInt, is expected to run slower than benchmark variant A above because BigInt operations become slower the larger the numbers represented by a BigInt are.

Input data: large(r) numbers, i.e. randomly drawn numbers from a 2^2048 address space.

Because of the input data characteristics this benchmark variant cannot be run for Long.

Results:

(*) denotes new, generic CMS implementation

Notes

Several benchmarks logged warnings and/or errors similar to the following:

WARNING: Hotspot compilation occurred during timing. Depending on the scope of the benchmark, this might significantly impact results. Consider running with a longer warmup.

WARNING: Hotspot compilation occurred after warmup, but outside of timing. Depending on the scope of the benchmark, this might significantly impact results. Consider running with a longer warmup.

WARNING: GC occurred during timing. Depending on the scope of the benchmark, this might significantly impact results. Consider running with a larger heap size.

Benchmark environment

  • MacBook Pro Retina (Late 2013), 2.3 GHz Intel Core i7, 16 GB RAM
  • Mac OS X 10.9.5
  • Oracle Java 7 (1.7.0_55)
  • IntelliJ IDEA 13.1

Caliper references

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment