@pyricau
Created September 7, 2023 00:22
A Kotlin script to compare the output of two Macrobenchmark runs, validating the data and computing the confidence interval for a difference between two means
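Usage sketch (the file names below are placeholders, not from the gist): save the script as, say, compare_macrobenchmarks.main.kts, then pass the two Macrobenchmark JSON result files to compare:

    kotlin compare_macrobenchmarks.main.kts benchmarkData-before.json benchmarkData-after.json

Thanks to the #!/usr/bin/env kotlin shebang, the script can also be made executable and invoked directly.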
#!/usr/bin/env kotlin
@file:Repository("https://repo.maven.apache.org/maven2/")
@file:DependsOn("com.datumbox:datumbox-framework-lib:0.8.2")
@file:DependsOn("com.squareup.okio:okio:3.3.0")
@file:DependsOn("com.squareup.moshi:moshi:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-adapters:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-kotlin:1.13.0")
import com.squareup.moshi.Moshi
import com.squareup.moshi.kotlin.reflect.KotlinJsonAdapterFactory
import okio.FileSystem
import okio.Path
import okio.Path.Companion.toPath
import okio.buffer
import kotlin.math.pow
import com.datumbox.framework.common.dataobjects.FlatDataCollection
import com.datumbox.framework.core.statistics.nonparametrics.onesample.ShapiroWilk
import java.text.DecimalFormat
import kotlin.math.roundToInt
import kotlin.math.sqrt
typealias DoubleRange = ClosedFloatingPointRange<Double>

check(args.size == 2) {
  "Expecting two files."
}

val pathToBenchmarkJsonFile1 = args[0].toPath()
val pathToBenchmarkJsonFile2 = args[1].toPath()

val analysis1 = pathToBenchmarkJsonFile1.parseMacrobenchmarkJson()
val analysis2 = pathToBenchmarkJsonFile2.parseMacrobenchmarkJson()

val comparison = compare(analysis1, analysis2)

for ((testName, metrics) in comparison.metricComparisonsByTest.entries) {
  println("###########################################################################")
  println("Results for $testName")
  for ((metricName, comparison) in metrics.entries) {
    println("##################################################")
    println(metricName)
    // zScore for confidence level 95%
    val zScore = 1.96
    val confidenceInterval = comparison.computeConfidenceInterval(zScore)
    val meanDifferenceRange = confidenceInterval.meanDifferenceRange
    val meanDifferencePercentRange = confidenceInterval.meanDifferencePercentRange
    val twoDecimals = DecimalFormat("#.##")
    println("#########################")
    println("DATA CHECKS")
    if (comparison.allChecksPass) {
      println("✓ All checks passed, the comparison conclusion is meaningful.\n")
    } else {
      println("˟ Some checks did not pass, the comparison conclusion is NOT meaningful.\n")
    }
    println(
      """
      Data checks for Benchmark 1
      - ${comparison.metric1.checkEnoughIterations.check()} At least 30 iterations (${comparison.metric1.sampleSize})
      - ${comparison.metric1.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(comparison.metric1.coefficientOfVariation * 100)}%) <= 6%
      - ${comparison.metric1.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      Data checks for Benchmark 2
      - ${comparison.metric2.checkEnoughIterations.check()} At least 30 iterations (${comparison.metric2.sampleSize})
      - ${comparison.metric2.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(comparison.metric2.coefficientOfVariation * 100)}%) <= 6%
      - ${comparison.metric2.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      - ${comparison.checkVarianceLessThanDouble.check()} Neither variance more than double the other (ratio: ${twoDecimals.format(comparison.varianceRatio)})
      #########################
      """.trimIndent()
    )
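    // Interpret the confidence interval for the difference of means (benchmark 2 - benchmark 1):
    // crossing 0 means no statistically significant change, entirely negative means an improvement,
    // entirely positive means a regression.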
println("RESULT")
println("Mean difference confidence interval at 95% confidence level:")
when {
0.0 in meanDifferenceRange -> {
println(
"The change yielded no statistical significance (the mean difference confidence interval crosses 0): "
+ "from "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%)."
)
}
meanDifferenceRange.endInclusive < 0.0 -> {
println(
"The change yielded a mean improvement of "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%)."
)
}
else -> {
println(
"The change yielded a mean regression of "
+ "${meanDifferenceRange.start.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.start * 100
)
}%) "
+ "to "
+ "${meanDifferenceRange.endInclusive.roundToInt()} ms (${
twoDecimals.format(
meanDifferencePercentRange.endInclusive * 100
)
}%)."
)
}
}
println("#########################")
println("MEDIANS")
println("The median went from ${comparison.metric1.median.roundToInt()} ms to ${comparison.metric2.median.roundToInt()} ms.")
println("DO NOT REPORT THE DIFFERENCE IN MEDIANS.")
println("This data helps contextualize results but is not statistically meaningful.")
println("#########################")
}
}
fun Boolean.check() = if (this) "✓" else "˟"
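
// Parses a Macrobenchmark JSON result file, binding only the fields modeled below via Moshi's Kotlin reflection adapter.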
fun Path.parseMacrobenchmarkJson(): BenchmarksData {
  val jsonSource = FileSystem.SYSTEM
    .source(this)
    .buffer()
    .readUtf8()
  val jsonParser: Moshi = Moshi.Builder()
    .add(KotlinJsonAdapterFactory())
    .build()
  return jsonParser
    .adapter(BenchmarksData::class.java)
    .fromJson(jsonSource) as BenchmarksData
}
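
// Population variance: mean of the squared deviations from the mean (divides by n, not n - 1).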
fun List<Double>.variance(): Double {
  var sum = 0.0
  val mean = average()
  forEach { value ->
    sum += (value - mean).pow(2)
  }
  return sum / size
}

data class BenchmarksData(
  val benchmarks: List<Benchmark>
)

data class Benchmark(
  val name: String,
  val className: String,
  val metrics: Map<String, Metric>,
  val repeatIterations: Int
) {
  val testName by lazy {
    "${className}#${name}"
  }
}
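
// Samples for a single metric from one run ("runs"), plus the derived statistics used by the data
// checks and by the confidence interval computation.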
data class Metric(
  val runs: List<Double>
) {
  val mean by lazy {
    runs.average()
  }
  val median by lazy {
    runs.p(50)
  }
  val variance by lazy {
    runs.variance()
  }
  val sampleSize: Int
    get() = runs.size
  val standardDeviation by lazy {
    sqrt(variance)
  }
  val coefficientOfVariation by lazy {
    standardDeviation / mean
  }
  val checkEnoughIterations by lazy {
    sampleSize >= 30
  }
  val checkLatenciesPassNormalityTest by lazy {
    // null hypothesis: the distribution is normal.
    // alpha level (5%): probability of wrongly rejecting the hypothesis that the distribution is normal (null hypothesis).
    val alphaLevel = 0.05
    val rejectNullHypothesis = ShapiroWilk.test(FlatDataCollection(runs), alphaLevel)
    !rejectNullHypothesis
  }
  val checkCoefficientOfVariationLowEnough by lazy {
    coefficientOfVariation <= 0.06
  }

  private fun List<Double>.variance(): Double {
    var sum = 0.0
    val mean = average()
    forEach { value ->
      sum += (value - mean).pow(2)
    }
    return sum / size
  }
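
  /** Percentile computed by linear interpolation between the two nearest samples, e.g. p(50) is the median. */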
  private fun List<Double>.p(percentile: Int): Double {
    // Sort defensively: the interpolation below assumes the samples are ordered.
    val sorted = sorted()
    val idealIndex = percentile.coerceIn(0, 100) / 100.0 * (size - 1)
    val firstIndex = idealIndex.toInt()
    val secondIndex = firstIndex + 1
    val firstValue = sorted[firstIndex]
    val secondValue = sorted.getOrElse(secondIndex) { firstValue }
    return lerp(firstValue, secondValue, idealIndex - firstIndex)
  }

  /** lerp is a classic function name for linear interpolation */
  private fun lerp(
    a: Double,
    b: Double,
    ratio: Double
  ): Double {
    return (a * (1 - ratio) + b * (ratio))
  }
}

data class PairedBenchmarkComparison(
  val benchmarkData1: BenchmarksData,
  val benchmarkData2: BenchmarksData,
  val metricComparisonsByTest: Map<String, Map<String, MetricComparison>>
)

data class MetricComparison(
  val metric1: Metric,
  val metric2: Metric
) {
  val varianceRatio by lazy {
    metric2.variance / metric1.variance
  }
  val checkVarianceLessThanDouble by lazy {
    varianceRatio in 0.5..2.0
  }
  val allChecksPass by lazy {
    metric1.checkEnoughIterations &&
      metric1.checkCoefficientOfVariationLowEnough &&
      metric1.checkLatenciesPassNormalityTest &&
      metric2.checkEnoughIterations &&
      metric2.checkCoefficientOfVariationLowEnough &&
      metric2.checkLatenciesPassNormalityTest &&
      checkVarianceLessThanDouble
  }
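  // Pooled standard deviation of the two samples, assuming roughly similar variances (checked above):
  // sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)).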
  val pooledEstimateOfStandardDeviation by lazy {
    val sizeMinusOne1 = metric1.sampleSize - 1
    val sizeMinusOne2 = metric2.sampleSize - 1
    sqrt(
      (sizeMinusOne1 * metric1.variance + sizeMinusOne2 * metric2.variance) /
        (sizeMinusOne1 + sizeMinusOne2)
    )
  }
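  // Standard error of the difference between the two means: pooled standard deviation * sqrt(1 / n1 + 1 / n2).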
  val standardError by lazy {
    pooledEstimateOfStandardDeviation * sqrt((1.0 / metric1.sampleSize) + (1.0 / metric2.sampleSize))
  }

  fun computeConfidenceInterval(zScore: Double) = ConfidenceInterval(zScore, this)
}
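
// Confidence interval for the difference of means (benchmark 2 - benchmark 1):
// meanDifference ± zScore * standardError, also expressed as a fraction of benchmark 1's mean.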
class ConfidenceInterval(
  val zScore: Double,
  val metrics: MetricComparison
) {
  val errorMargin by lazy {
    zScore * metrics.standardError
  }
  val range by lazy {
    errorMargin * 2
  }
  val meanDifference by lazy {
    metrics.metric2.mean - metrics.metric1.mean
  }
  val meanDifferenceRange by lazy {
    (meanDifference - errorMargin).rangeTo(meanDifference + errorMargin)
  }
  val meanDifferencePercentRange by lazy {
    (meanDifferenceRange.start / metrics.metric1.mean).rangeTo(meanDifferenceRange.endInclusive / metrics.metric1.mean)
  }
}
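
// Pairs up the two runs: benchmarks are matched by "ClassName#testName" and metrics by name.
// Both files must contain the exact same set of tests and metrics, otherwise the script fails fast.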
fun compare(
  benchmarkData1: BenchmarksData,
  benchmarkData2: BenchmarksData
): PairedBenchmarkComparison {
  val tests1 = benchmarkData1.benchmarks.associateBy { it.testName }
  val tests2 = benchmarkData2.benchmarks.associateBy { it.testName }
  check(tests1.keys == tests2.keys) {
    "Expected exact same set of tests between ${tests1.keys} and ${tests2.keys}"
  }
  val testsWithPairedData = tests1.mapValues { (testName, benchmark1) ->
    val benchmark2 = tests2.getValue(testName)
    check(benchmark1.metrics.keys == benchmark2.metrics.keys) {
      "Expected exact same set of metrics for $testName between ${benchmark1.metrics.keys} and ${benchmark2.metrics.keys}"
    }
    benchmark1.metrics.mapValues { (metricName, metric1) ->
      val metric2 = benchmark2.metrics.getValue(metricName)
      MetricComparison(metric1, metric2)
    }
  }
  return PairedBenchmarkComparison(benchmarkData1, benchmarkData2, testsWithPairedData)
}