@pkkm
Created April 16, 2018 21:37
Testing the reliability of various statistics in the Criterion library.
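
analyze.R: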
#!/usr/bin/env Rscript
library(ggplot2)
library(reshape2)
library(pander)
data <- read.csv("results/results.csv", check.names=FALSE)
# Reorder columns for readability.
col_order <- c("Least-squares slope", "Theil-Sen slope",
"Mean", "Median of means",
"Minimum of means", "Quartile 1 of means", "Quartile 3 of means")
data <- data[, col_order]
molten <- melt(data)
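# `melt` stacks the columns into two: `variable` (the statistic's name) and
# `value` (its value), giving one row per measurement.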
# Assign categories to variables.
molten$type <- factor(
  "This should never be visible",
  levels=c("Central tendency", "Regression", "Other", "This should never be visible"))
molten <- within(molten, type[variable == "Least-squares slope" | variable == "Theil-Sen slope"] <- "Regression")
molten <- within(molten, type[variable == "Mean" | variable == "Median of means"] <- "Central tendency")
molten <- within(molten, type[variable == "Quartile 1 of means" | variable == "Quartile 3 of means" | variable == "Minimum of means"] <- "Other")
# Draw densities.
plot <- ggplot(molten, aes(x=value, color=variable)) +
  geom_density(adjust=0.5) +
  labs(x="Time [s]", y="Number of benchmarks (smoothed)", color="") +
  facet_wrap("type", scales="fixed", ncol=1)
ggsave("results/density.pdf", plot, device=cairo_pdf, width=8, height=6)
# Draw boxplots.
plot <- ggplot(molten, aes(x=variable, y=value)) +
  geom_boxplot() +
  labs(x="Statistic", y="Time [s]") +
  theme(axis.text.x=element_text(angle=25, hjust=1))
png(filename="results/boxplot.png", type="cairo", width=1100, height=1100, units="px", res=200)
print(plot)
dev.off()
# Data range as a single number (instead of vector of min and max).
range_num <- function(data) {
  return(diff(range(data)))
}
# Summarize the spread of the data in a table.
df <- data.frame()
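# Indexing row `ncol(data)` grows the empty data frame to one row per statistic.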
df[ncol(data),] <- NA
rownames(df) <- names(data)
iqr_rel <- apply(data, 2, IQR) / apply(data, 2, median)
df$`IQR/Median` <- sprintf("%.1f%%", unlist(iqr_rel * 100))
range_rel <- apply(data, 2, range_num) / apply(data, 2, median)
df$`Range/Median` <- sprintf("%.1f%%", unlist(range_rel * 100))
table <- pandoc.table.return(
  df, style="rmarkdown", justify=c("right", "left", "left"), emphasize.rownames=FALSE)
handle <- file("results/summary.md")
writeLines(table, handle)
close(handle)
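
benchmark.py: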
#!/usr/bin/env python3
import argparse
import json
import os
import subprocess
import tempfile
import numpy
import scipy.stats
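
# format_s() mimics Criterion's human-readable output, e.g. format_s(1.234e-3)
# gives "1.234 ms" and format_s(5.67e-5) gives "56.70 μs".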
def format_s(seconds):
    """Format a time in seconds like Criterion does."""
    if seconds < 0:
        return "-" + format_s(-seconds)

    def format_with_prefix(seconds, prefix):
        """Format to 4 digits, even if they are trailing zeros."""
        if seconds >= 1e9:
            return "{:.4g} {}".format(seconds, prefix)
        for exponent in [3, 2, 1]:
            if seconds >= 10 ** exponent:
                return "{1:.{0}f} {2}".format(3 - exponent, seconds, prefix)
        return "{:.3f} {}".format(seconds, prefix)

    PREFIXES = [
        (0, ""),
        (-3, "m"),
        (-6, "μ"),
        (-9, "n"),
        (-12, "p"),
        (-15, "f"),
        (-18, "a")]
    for exponent, prefix in PREFIXES:
        if seconds >= 10 ** exponent:
            return format_with_prefix(seconds * 10 ** (-exponent), prefix + "s")
    return "{:g} s".format(seconds)
def format_row(a, b, c=""):
"""Format a row of output."""
return "{: <20} {: <10} {}".format(a, b, c)


def criterion_print_extra_stats(benchmark):
    """Print some extra statistics that Criterion doesn't provide.

    `benchmark` should be a parsed JSON object describing a single benchmark
    from Criterion's output (tested on Criterion 1.2.3)."""
    # Extract columns which are interesting and should be non-null.
    keys = benchmark["reportKeys"]
    indices = {name: index for index, name in enumerate(keys)}

    def process(datum):
        return {key: datum[indices[key]]
                for key in ["time", "cpuTime", "iters"]}
    measured = list(map(process, benchmark["reportMeasured"]))

    # Criterion repeatedly executes the benchmarked code in a loop with an
    # increasing number of iterations. `time` and `cpuTime` are totals for the
    # loop and `iters` is the number of iterations.
    mean_times = [datum["time"] / datum["iters"] for datum in measured]
    print(format_row(
        "quartiles of means",
        ", ".join(
            format_s(numpy.percentile(mean_times, p))
            for p in [25, 50, 75])))

    # Theil-Sen regression of time vs. number of iterations.
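    # (Theil-Sen fits the median of the slopes between all pairs of points,
    # so a few outlier measurements barely affect the estimate.)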
    slope, intercept, *_ = scipy.stats.theilslopes(
        [m["time"] for m in measured], [m["iters"] for m in measured])
    print(format_row(
        "Theil-Sen",
        format_s(slope),
        "(intercept: {})".format(format_s(intercept))))

    print(format_row("min of means", format_s(numpy.amin(mean_times))))


def criterion_benchmark(command, time_limit_s=None):
    """Benchmark a shell command using Criterion and print the results."""
    with tempfile.TemporaryDirectory(prefix="benchmark-") as dir_name:
        json_file = os.path.join(dir_name, "criterion-out.json")
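        # `bench` is a command-line tool that runs Criterion benchmarks on
        # shell commands; `--json` makes it also dump the raw measurements.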
bench_command = ["bench"]
if time_limit_s is not None:
bench_command += ["--time-limit", str(time_limit_s)]
bench_command += ["--json", json_file, "--", command]
process = subprocess.run(bench_command, stdout=subprocess.PIPE)
print(process.stdout.decode("utf-8").rstrip("\n"))
with open(json_file, "r") as f:
data = json.load(f)
data = data[2] # Skip the header.
assert len(data) == 1 # We're always doing a single benchmark.
criterion_print_extra_stats(data[0])
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("command", help="sh command to benchmark")
parser.add_argument(
"--time-limit", type=int, default=60,
help="time limit in seconds for the whole benchmark")
args = parser.parse_args()
criterion_benchmark(args.command, time_limit_s=args.time_limit)
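
Driver script (its filename isn't shown in the gist; it calls ./benchmark.py for each run, writes results/results.csv, and finishes with ./analyze.R, which produces density.pdf, boxplot.png and summary.md in results/):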
#!/usr/bin/env bash
command='bash -c "a=0; for i in {1..500000}; do (( a += RANDOM )); done"'
n_warmup_runs=5
n_benchmarks=60
single_benchmark_time=60
require_cmd_present() {
  for cmd in "$@"; do
    if ! command -v -- "$cmd" >/dev/null 2>&1; then
      printf "ERROR: Required command \`%s\` not found.\n" "$cmd" 1>&2
      exit 1
    fi
  done
}
require_cmd_present python3 grep cut tr sed bench Rscript
mkdir -p "results"
single_benchmark() {
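  # Turn benchmark.py's aligned human-readable output into one CSV row:
  # keep only the lines with statistics, drop the 21-character label column
  # and any parenthesized intercept, delete everything except digits, dots
  # and commas, then join the lines with commas and strip the trailing one.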
  ./benchmark.py "$command" --time-limit "$single_benchmark_time" |
    grep -E "time|mean|quartile|Theil|min" |
    cut -c 22- |
    cut -d\( -f1 |
    tr -cd ".,\n0-9" |
    tr "\n" "," |
    sed "s/,$//"
  printf "\n"
}
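# Run a few benchmarks to warm up the system (discarding their output), then
# write the CSV header followed by one row per real benchmark run.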
{
  for ((i=0; i<"$n_warmup_runs"; i++)); do
    single_benchmark >/dev/null
  done
  echo "Least-squares slope,Mean,Quartile 1 of means,Median of means,Quartile 3 of means,Theil-Sen slope,Minimum of means"
  for ((i=0; i<"$n_benchmarks"; i++)); do
    single_benchmark
  done
} >"results/results.csv"
./analyze.R