@pyricau
Created September 7, 2023 00:22
A Kotlin script to compare the output of two Macrobenchmark runs, validating the data and computing the confidence interval for a difference between two means
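Usage sketch (the file names below are placeholders, not from the gist): save the script as, say, compare_macrobenchmarks.main.kts, then pass the two Macrobenchmark JSON result files to compare:

    kotlin compare_macrobenchmarks.main.kts benchmarkData-before.json benchmarkData-after.json

Thanks to the #!/usr/bin/env kotlin shebang, the script can also be made executable and invoked directly.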
#!/usr/bin/env kotlin
@file:Repository("https://repo.maven.apache.org/maven2/")
@file:DependsOn("com.datumbox:datumbox-framework-lib:0.8.2")
@file:DependsOn("com.squareup.okio:okio:3.3.0")
@file:DependsOn("com.squareup.moshi:moshi:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-adapters:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-kotlin:1.13.0")
import com.squareup.moshi.Moshi
import com.squareup.moshi.kotlin.reflect.KotlinJsonAdapterFactory
import okio.FileSystem
import okio.Path
import okio.Path.Companion.toPath
import okio.buffer
import kotlin.math.pow
import com.datumbox.framework.common.dataobjects.FlatDataCollection
import com.datumbox.framework.core.statistics.nonparametrics.onesample.ShapiroWilk
import java.text.DecimalFormat
import kotlin.math.roundToInt
import kotlin.math.sqrt
typealias DoubleRange = ClosedFloatingPointRange<Double>

check(args.size == 2) {
  "Expecting two files."
}

val pathToBenchmarkJsonFile1 = args[0].toPath()
val pathToBenchmarkJsonFile2 = args[1].toPath()

val analysis1 = pathToBenchmarkJsonFile1.parseMacrobenchmarkJson()
val analysis2 = pathToBenchmarkJsonFile2.parseMacrobenchmarkJson()

val comparison = compare(analysis1, analysis2)

for ((testName, metrics) in comparison.metricComparisonsByTest.entries) {
  println("###########################################################################")
  println("Results for $testName")
  for ((metricName, comparison) in metrics.entries) {
    println("##################################################")
    println(metricName)
    // zScore for confidence level 95%
    val zScore = 1.96
    val confidenceInterval = comparison.computeConfidenceInterval(zScore)
    val meanDifferenceRange = confidenceInterval.meanDifferenceRange
    val meanDifferencePercentRange = confidenceInterval.meanDifferencePercentRange
    val twoDecimals = DecimalFormat("#.##")
    println("#########################")
    println("DATA CHECKS")
    if (comparison.allChecksPass) {
      println("✓ All checks passed, the comparison conclusion is meaningful.\n")
    } else {
      println("˟ Some checks did not pass, the comparison conclusion is NOT meaningful.\n")
    }
    println(
      """
      Data checks for Benchmark 1
      - ${comparison.metric1.checkEnoughIterations.check()} At least 30 iterations (${comparison.metric1.sampleSize})
      - ${comparison.metric1.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(comparison.metric1.coefficientOfVariation * 100)}%) <= 6%
      - ${comparison.metric1.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      Data checks for Benchmark 2
      - ${comparison.metric2.checkEnoughIterations.check()} At least 30 iterations (${comparison.metric2.sampleSize})
      - ${comparison.metric2.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(comparison.metric2.coefficientOfVariation * 100)}%) <= 6%
      - ${comparison.metric2.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      - ${comparison.checkVarianceLessThanDouble.check()} Neither variance more than double the other (ratio: ${twoDecimals.format(comparison.varianceRatio)})
      #########################
      """.trimIndent()
    )
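    // Interpret the confidence interval for the difference of means (benchmark 2 - benchmark 1):
    // crossing 0 means no statistically significant change, entirely negative means an improvement,
    // entirely positive means a regression.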
println("RESULT")
println("Mean difference confidence interval at 95% confidence level:")
when {
0.0 in meanDifferenceRange -> {
println(
"The change yielded no statistical significance (the mean difference confidence interval crosses 0): "
+ "from "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%)."
)
}
meanDifferenceRange.endInclusive < 0.0 -> {
println(
"The change yielded a mean improvement of "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%)."
)
}
else -> {
println(
"The change yielded a mean regression of "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%)."
)
}
}
println("#########################")
println("MEDIANS")
println("The median went from ${comparison.metric1.median.roundToInt()} ms to ${comparison.metric2.median.roundToInt()} ms.")
println("DO NOT REPORT THE DIFFERENCE IN MEDIANS.")
println("This data helps contextualize results but is not statistically meaningful.")
println("#########################")
}
}
fun Boolean.check() = if (this) "✓" else "˟"
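
// Parses a Macrobenchmark JSON result file, binding only the fields modeled below via Moshi's Kotlin reflection adapter.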
fun Path.parseMacrobenchmarkJson(): BenchmarksData {
  val jsonSource = FileSystem.SYSTEM
    .source(this)
    .buffer()
    .readUtf8()
  val jsonParser: Moshi = Moshi.Builder()
    .add(KotlinJsonAdapterFactory())
    .build()
  return jsonParser
    .adapter(BenchmarksData::class.java)
    .fromJson(jsonSource) as BenchmarksData
}
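
// Population variance: mean of the squared deviations from the mean (divides by n, not n - 1).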
fun List<Double>.variance(): Double {
  var sum = 0.0
  val mean = average()
  forEach { value ->
    sum += (value - mean).pow(2)
  }
  return sum / size
}

data class BenchmarksData(
  val benchmarks: List<Benchmark>
)

data class Benchmark(
  val name: String,
  val className: String,
  val metrics: Map<String, Metric>,
  val repeatIterations: Int
) {
  val testName by lazy {
    "${className}#${name}"
  }
}
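
// Samples for a single metric from one run ("runs"), plus the derived statistics used by the data
// checks and by the confidence interval computation.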
data class Metric(
  val runs: List<Double>
) {
  val mean by lazy {
    runs.average()
  }
  val median by lazy {
    runs.p(50)
  }
  val variance by lazy {
    runs.variance()
  }
  val sampleSize: Int
    get() = runs.size
  val standardDeviation by lazy {
    sqrt(variance)
  }
  val coefficientOfVariation by lazy {
    standardDeviation / mean
  }
  val checkEnoughIterations by lazy {
    sampleSize >= 30
  }
  val checkLatenciesPassNormalityTest by lazy {
    // null hypothesis: the distribution is normal.
    // alpha level (5%): probability of wrongly rejecting the hypothesis that the distribution is normal (null hypothesis).
    val alphaLevel = 0.05
    val rejectNullHypothesis = ShapiroWilk.test(FlatDataCollection(runs), alphaLevel)
    !rejectNullHypothesis
  }
  val checkCoefficientOfVariationLowEnough by lazy {
    coefficientOfVariation <= 0.06
  }

  private fun List<Double>.variance(): Double {
    var sum = 0.0
    val mean = average()
    forEach { value ->
      sum += (value - mean).pow(2)
    }
    return sum / size
  }
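
  /** Percentile computed by linear interpolation between the two nearest samples, e.g. p(50) is the median. */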
  private fun List<Double>.p(percentile: Int): Double {
    // Sort defensively: the interpolation below assumes the samples are ordered.
    val sorted = sorted()
    val idealIndex = percentile.coerceIn(0, 100) / 100.0 * (size - 1)
    val firstIndex = idealIndex.toInt()
    val secondIndex = firstIndex + 1
    val firstValue = sorted[firstIndex]
    val secondValue = sorted.getOrElse(secondIndex) { firstValue }
    return lerp(firstValue, secondValue, idealIndex - firstIndex)
  }

  /** lerp is a classic function name for linear interpolation */
  private fun lerp(
    a: Double,
    b: Double,
    ratio: Double
  ): Double {
    return (a * (1 - ratio) + b * (ratio))
  }
}

data class PairedBenchmarkComparison(
  val benchmarkData1: BenchmarksData,
  val benchmarkData2: BenchmarksData,
  val metricComparisonsByTest: Map<String, Map<String, MetricComparison>>
)

data class MetricComparison(
  val metric1: Metric,
  val metric2: Metric
) {
  val varianceRatio by lazy {
    metric2.variance / metric1.variance
  }
  val checkVarianceLessThanDouble by lazy {
    varianceRatio in 0.5..2.0
  }
  val allChecksPass by lazy {
    metric1.checkEnoughIterations &&
      metric1.checkCoefficientOfVariationLowEnough &&
      metric1.checkLatenciesPassNormalityTest &&
      metric2.checkEnoughIterations &&
      metric2.checkCoefficientOfVariationLowEnough &&
      metric2.checkLatenciesPassNormalityTest &&
      checkVarianceLessThanDouble
  }
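  // Pooled standard deviation of the two samples, assuming roughly similar variances (checked above):
  // sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)).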
  val pooledEstimateOfStandardDeviation by lazy {
    val sizeMinusOne1 = metric1.sampleSize - 1
    val sizeMinusOne2 = metric2.sampleSize - 1
    sqrt(
      (sizeMinusOne1 * metric1.variance + sizeMinusOne2 * metric2.variance) /
        (sizeMinusOne1 + sizeMinusOne2)
    )
  }
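  // Standard error of the difference between the two means: pooled standard deviation * sqrt(1 / n1 + 1 / n2).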
  val standardError by lazy {
    pooledEstimateOfStandardDeviation * sqrt((1.0 / metric1.sampleSize) + (1.0 / metric2.sampleSize))
  }

  fun computeConfidenceInterval(zScore: Double) = ConfidenceInterval(zScore, this)
}
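
// Confidence interval for the difference of means (benchmark 2 - benchmark 1):
// meanDifference ± zScore * standardError, also expressed as a fraction of benchmark 1's mean.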
class ConfidenceInterval(
  val zScore: Double,
  val metrics: MetricComparison
) {
  val errorMargin by lazy {
    zScore * metrics.standardError
  }
  val range by lazy {
    errorMargin * 2
  }
  val meanDifference by lazy {
    metrics.metric2.mean - metrics.metric1.mean
  }
  val meanDifferenceRange by lazy {
    (meanDifference - errorMargin).rangeTo(meanDifference + errorMargin)
  }
  val meanDifferencePercentRange by lazy {
    (meanDifferenceRange.start / metrics.metric1.mean).rangeTo(meanDifferenceRange.endInclusive / metrics.metric1.mean)
  }
}
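
// Pairs up the two runs: benchmarks are matched by "ClassName#testName" and metrics by name.
// Both files must contain the exact same set of tests and metrics, otherwise the script fails fast.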
fun compare(
  benchmarkData1: BenchmarksData,
  benchmarkData2: BenchmarksData
): PairedBenchmarkComparison {
  val tests1 = benchmarkData1.benchmarks.associateBy { it.testName }
  val tests2 = benchmarkData2.benchmarks.associateBy { it.testName }
  check(tests1.keys == tests2.keys) {
    "Expected exact same set of tests between ${tests1.keys} and ${tests2.keys}"
  }
  val testsWithPairedData = tests1.mapValues { (testName, benchmark1) ->
    val benchmark2 = tests2.getValue(testName)
    check(benchmark1.metrics.keys == benchmark2.metrics.keys) {
      "Expected exact same set of metrics for $testName between ${benchmark1.metrics.keys} and ${benchmark2.metrics.keys}"
    }
    benchmark1.metrics.mapValues { (metricName, metric1) ->
      val metric2 = benchmark2.metrics.getValue(metricName)
      MetricComparison(metric1, metric2)
    }
  }
  return PairedBenchmarkComparison(benchmarkData1, benchmarkData2, testsWithPairedData)
}