awreece/abtest.py

## abtest.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import blessed
from collections import defaultdict
import csv
import jinja2
import logging
import numpy
import os
import re
from scipy import stats
import subprocess
import sys
from tempfile import NamedTemporaryFile

# Ensure that we can output color escape characters and utf-8.
#
# This is a Python 2 idiom: `reload(sys)` makes `sys.setdefaultencoding`
# callable again so that the default str/unicode conversion can be switched
# to utf-8. The strings built below embed non-ascii characters (μ, ± and the
# histogram block glyphs).
reload(sys)
sys.setdefaultencoding("utf-8")

# Shared terminal handle; its color helpers (blue, cyan, red) wrap the
# format strings passed to `logging` further down.
term = blessed.Terminal()


def GetMeanStr(values, confidence):
    """
    Returns a string representing a confidence interval around the mean.

    Arguments:
        values: A sequence of execution times in microseconds (they are
            divided by 1000 and reported in milliseconds).
        confidence: The two sided confidence level of the interval, e.g.
            0.999 for a 99.9% confidence interval.

    Returns:
        A string of the form "<mean>±<half width>ms".
    """
    #
    # A symmetric "mean ± error" interval leaves (1 - confidence) / 2 of the
    # probability in each tail, so the critical value is the
    # (1 + confidence) / 2 quantile of the t distribution. Passing
    # `confidence` itself to ppf yields a one sided bound and understates
    # the error.
    #
    criticalValue = stats.t.ppf((1.0 + confidence) / 2.0, len(values) - 1)
    error = stats.sem(values) * criticalValue
    return "%.2f±%.2fms" % (numpy.mean(values) / 1000, error / 1000)


def GetBucketChar(count, maxCount):
    """
    Returns the block character used to draw one histogram bucket.

    Arguments:
        count: The number of values in this bucket.
        maxCount: The number of values in the fullest bucket; a bucket
            holding this many values is drawn as a full block.

    Returns:
        One of nine characters, from a blank (empty bucket) to a full block.
    """
    blocks = [' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█']

    #
    # An empty histogram has maxCount == 0; treat every bucket as empty
    # rather than dividing by zero.
    #
    ratio = float(count) / float(maxCount) if maxCount > 0 else 0.0
    bi = int(ratio * (len(blocks) - 1))

    # Keep the index inside `blocks` even for inconsistent inputs.
    bi = max(0, min(bi, len(blocks) - 1))

    #
    # Deliberately show outliers, even if they would not have otherwise
    # appeared.
    #
    if count > 0 and bi == 0:
        bi = 1
    return blocks[bi]


def GetHistogramString(arr, **kwargs):
    """
    Renders `arr` as a one line histogram labelled with its range.

    Arguments:
        arr: A sequence of execution times in microseconds (the range
            labels are divided by 1000 and reported in milliseconds).
        **kwargs: Histogram parameters, named as they were for
            `scipy.stats.histogram`:
                numbins: The number of buckets (default 10).
                defaultlimits: A (lower, upper) tuple giving the range of
                    the histogram. If omitted, the range of `arr` padded
                    by half a bucket on either side.
                weights: Optional weights, one per value in `arr`.
                printextras: Accepted for compatibility and ignored.

    Returns:
        A string of the form
        "<lower>ms : <one character per bucket> : <upper>ms".

    Raises:
        TypeError: If an unknown keyword argument is passed.
    """
    numbins = kwargs.pop("numbins", 10)
    limits = kwargs.pop("defaultlimits", None)
    weights = kwargs.pop("weights", None)
    kwargs.pop("printextras", None)
    if kwargs:
        raise TypeError("Unexpected histogram arguments: %s"
                        % ", ".join(sorted(kwargs)))

    values = numpy.ravel(arr)
    if limits is None and values.size > 0 and numbins > 1:
        pad = 0.5 * (values.max() - values.min()) / (numbins - 1)
        limits = (values.min() - pad, values.max() + pad)

    #
    # `scipy.stats.histogram` is no longer available in current SciPy
    # releases; `numpy.histogram` is the documented replacement and also
    # hands back the exact bucket edges.
    #
    buckets, edges = numpy.histogram(values, bins=numbins, range=limits,
                                     weights=weights)

    # The tallest bucket sets the scale; compute it once, not per bucket.
    maxCount = max(buckets)
    bars = "".join(GetBucketChar(count, maxCount) for count in buckets)

    #
    # The right hand label is the upper edge of the last bucket, i.e.
    # lower + binsize * len(buckets), not one bucket beyond it.
    #
    return "%7.2fms : %s : %7.2fms" % (edges[0] / 1000, bars,
                                       edges[-1] / 1000)


def LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """
    Logs summary statistics for the executions of both variants.

    Summary:
        Calculates the mean (with its confidence interval) and a histogram
        for each of the input arrays and emits them to `logging.info`, one
        line per variant. Deliberately attempts to align the columns and
        ensure that both histograms use the same scale to improve
        readability.

    Arguments:
        args: The command line arguments containing the histogram parameters
            (`histogram_buckets`) and the confidence level (`confidence`).
        oldVariant: The name of the old variant.
        oldExecutions: A non-empty array of floats containing old execution
            times in microseconds.
        newVariant: The name of the new variant.
        newExecutions: A non-empty array of floats containing new execution
            times in microseconds.
    """
    #
    # We take special care to ensure that the histograms will line up (same
    # size bucket at same point on the screen).
    #
    minExecution = min(min(newExecutions), min(oldExecutions))
    maxExecution = max(max(newExecutions), max(oldExecutions))

    #
    # Pad the range by half a bucket on either side so that the smallest and
    # largest values sit in the middle of the first and last buckets. Note
    # the `0.5`: written as `(1/2)` this is integer division under Python 2,
    # which evaluates to 0 and silently disables the padding. A single
    # bucket needs no padding (and would otherwise divide by zero).
    #
    if args.histogram_buckets > 1:
        s = 0.5 * (maxExecution - minExecution) / (args.histogram_buckets - 1)
    else:
        s = 0.0
    limits = (minExecution - s, maxExecution + s)

    newExecutionHist = GetHistogramString(
        newExecutions,
        defaultlimits=limits,
        numbins=args.histogram_buckets)
    oldExecutionHist = GetHistogramString(
        oldExecutions,
        defaultlimits=limits,
        numbins=args.histogram_buckets)

    newExecutionMeanStr = "μ=" + GetMeanStr(newExecutions, args.confidence)
    oldExecutionMeanStr = "μ=" + GetMeanStr(oldExecutions, args.confidence)
    maxMeanLen = max(len(oldExecutionMeanStr), len(newExecutionMeanStr))

    maxVariantLen = max(len(oldVariant), len(newVariant))

    # `%-*s` left-justifies each column to the width passed just before it.
    logging.info(term.blue("%-*s : %-*s : %s"),
                 maxVariantLen, oldVariant, maxMeanLen, oldExecutionMeanStr,
                 oldExecutionHist)
    logging.info(term.cyan("%-*s : %-*s : %s"),
                 maxVariantLen, newVariant, maxMeanLen, newExecutionMeanStr,
                 newExecutionHist)

def DoABTest(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """
    Checks whether the new variant regressed execution time.

    Summary:
        Logs the statistics of both variants, then compares the new
        executions against the old executions scaled up by the allowed
        regression (`args.max_regression`).

    Arguments:
        args: The command line arguments (`max_regression`, `confidence`
            and the histogram parameters).
        oldVariant: The name of the old variant.
        oldExecutions: An array of floats containing old execution times.
        newVariant: The name of the new variant.
        newExecutions: An array of floats containing new execution times.

    Returns:
        False if the new mean is significantly above the scaled old mean,
        True otherwise (including when the test is inconclusive).
    """
    logging.debug("AB Testing  %s (old) vs %s (new)", oldVariant, newVariant)

    allowance = 1 + args.max_regression
    paddedOldExecutions = [allowance * sample for sample in oldExecutions]

    LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions)

    #
    # Run a Welch two sample t test to ensure that we have not regressed
    # execution perf.
    #
    # While this test assumes normality, the Welch's variant does *not*
    # assume homoscedasticity (i.e. both populations have the same
    # variance). Other similar tests, such as the Mann-Whitney U test,
    # are sensitive to this property:
    #
    #   If the distributions are heteroscedastic, the Kruskal–Wallis test
    #   won't help you; instead, you should use Welch's t–test for two
    #   groups, or Welch's anova for more than two groups.
    #
    #   http://www.biostathandbook.com/kruskalwallis.html
    #
    _, pValue = stats.ttest_ind(newExecutions, paddedOldExecutions,
                                equal_var=False)

    # Not significant at the requested confidence: nothing can be concluded.
    if not pValue < 1 - args.confidence:
        logging.debug("Execution had too much variance to make conclusion")
        return True

    newMean = numpy.mean(newExecutions)
    paddedOldMean = numpy.mean(paddedOldExecutions)
    oldMean = numpy.mean(oldExecutions)

    # A significant difference only fails the test if the new variant is slower.
    if not newMean > paddedOldMean:
        return True

    # Report the slowdown relative to the unscaled old mean.
    slowdownPct = ((newMean - oldMean) / oldMean) * 100
    logging.error(
        term.red("Execution regressed by %.1f%% (vs %.1f%%)"),
        slowdownPct, args.max_regression*100)

    # TODO Test 99th percentile

    return False

def main():
    """
    Parses the command line, reads the samples from stdin and AB tests them.

    Input:
        CSV rows of the form `<variant name>,<execution time in
        microseconds>` on stdin, covering exactly two variants. Blank lines
        are skipped. Unless `--old-variant` is given, the variant of the
        first row is treated as the old variant.

    Returns:
        The result of `DoABTest`: False if the new variant regressed, True
        otherwise.

    Raises:
        SystemExit: Via `parser.error` (exit status 2) if a row is
            malformed, if the input does not contain exactly two variants,
            or if `--old-variant` names a variant that is not in the input.
    """
    parser = argparse.ArgumentParser(
        description="AB variant execution performance (old vs new).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # default=0 so that the comparison below never sees None when no -v is
    # passed.
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="Increase output verbosity")

    parser.add_argument("--confidence", type=float, default=0.999,
                        help="Confidence interval (e.g. be 99.9%% confident " +
                        "of all reported values.")
    parser.add_argument("--max-regression", type=float, default=0.02,
                        help="Maximum allowed execution regression (e.g. " +
                        "new execution must within 2%% of old execution).")
    parser.add_argument("--old-variant", help="Test type to use as old test " +
                        "variant. By default, the first test type " +
                        "encountered is the old variant.")

    parser.add_argument("--histogram-buckets", type=int, default=15,
                        help="Number of histogram buckets to use.")

    args = parser.parse_args()

    if args.verbosity >= 1:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    else:
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    executions = defaultdict(list)
    oldVariant = args.old_variant
    for row in csv.reader(sys.stdin):
        # csv.reader yields an empty list for a blank line.
        if not row:
            continue
        if len(row) != 2:
            parser.error("Expected rows of the form 'variant,micros', "
                         "got: %r" % (row,))
        name, executionMicros = row
        try:
            micros = float(executionMicros)
        except ValueError:
            parser.error("Execution time is not a number: %r"
                         % (executionMicros,))
        oldVariant = oldVariant if oldVariant else name
        executions[name].append(micros)

    #
    # Validate explicitly rather than with `assert`, which is stripped when
    # python runs with -O.
    #
    if len(executions) != 2:
        parser.error("Can only do AB test of two test variants.")
    if oldVariant not in executions:
        parser.error("Old variant %r does not appear in the input."
                     % (oldVariant,))

    # Exactly two variants, one of them the old one: the other is the new.
    (newVariant,) = [name for name in executions if name != oldVariant]

    return DoABTest(args, oldVariant, executions[oldVariant],
                    newVariant, executions[newVariant])


if __name__ == "__main__":
    #
    # Report a regression through the exit status so that callers (e.g. a CI
    # job) can act on it. Only an explicit False from main() counts as a
    # failure; any other result, including None, exits with status 0.
    #
    sys.exit(1 if main() is False else 0)
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	import argparse
	import blessed
	from collections import defaultdict
	import csv
	import jinja2
	import logging
	import numpy
	import os
	import re
	from scipy import stats
	import subprocess
	import sys
	from tempfile import NamedTemporaryFile

	# Ensure that we can output color escape characters and utf-8.
	#
	# This is a Python 2 idiom: `reload(sys)` makes `sys.setdefaultencoding`
	# callable again so that the default str/unicode conversion can be
	# switched to utf-8. The strings built below embed non-ascii characters
	# (μ, ± and the histogram block glyphs).
	reload(sys)
	sys.setdefaultencoding("utf-8")

	# Shared terminal handle; its color helpers (blue, cyan, red) wrap the
	# format strings passed to `logging` further down.
	term = blessed.Terminal()


	def GetMeanStr(values, confidence):
	    """
	    Returns a string representing a confidence interval around the mean.

	    Arguments:
	        values: A sequence of execution times in microseconds (they are
	            divided by 1000 and reported in milliseconds).
	        confidence: The two sided confidence level of the interval, e.g.
	            0.999 for a 99.9% confidence interval.

	    Returns:
	        A string of the form "<mean>±<half width>ms".
	    """
	    #
	    # A symmetric "mean ± error" interval leaves (1 - confidence) / 2 of
	    # the probability in each tail, so the critical value is the
	    # (1 + confidence) / 2 quantile of the t distribution. Passing
	    # `confidence` itself to ppf yields a one sided bound and understates
	    # the error.
	    #
	    criticalValue = stats.t.ppf((1.0 + confidence) / 2.0, len(values) - 1)
	    error = stats.sem(values) * criticalValue
	    return "%.2f±%.2fms" % (numpy.mean(values) / 1000, error / 1000)


	def GetBucketChar(count, maxCount):
	    """
	    Returns the block character used to draw one histogram bucket.

	    Arguments:
	        count: The number of values in this bucket.
	        maxCount: The number of values in the fullest bucket; a bucket
	            holding this many values is drawn as a full block.

	    Returns:
	        One of nine characters, from a blank (empty bucket) to a full
	        block.
	    """
	    blocks = [' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█']

	    #
	    # An empty histogram has maxCount == 0; treat every bucket as empty
	    # rather than dividing by zero.
	    #
	    ratio = float(count) / float(maxCount) if maxCount > 0 else 0.0
	    bi = int(ratio * (len(blocks) - 1))

	    # Keep the index inside `blocks` even for inconsistent inputs.
	    bi = max(0, min(bi, len(blocks) - 1))

	    #
	    # Deliberately show outliers, even if they would not have otherwise
	    # appeared.
	    #
	    if count > 0 and bi == 0:
	        bi = 1
	    return blocks[bi]


	def GetHistogramString(arr, **kwargs):
	    """
	    Renders `arr` as a one line histogram labelled with its range.

	    Arguments:
	        arr: A sequence of execution times in microseconds (the range
	            labels are divided by 1000 and reported in milliseconds).
	        **kwargs: Histogram parameters, named as they were for
	            `scipy.stats.histogram`:
	                numbins: The number of buckets (default 10).
	                defaultlimits: A (lower, upper) tuple giving the range
	                    of the histogram. If omitted, the range of `arr`
	                    padded by half a bucket on either side.
	                weights: Optional weights, one per value in `arr`.
	                printextras: Accepted for compatibility and ignored.

	    Returns:
	        A string of the form
	        "<lower>ms : <one character per bucket> : <upper>ms".

	    Raises:
	        TypeError: If an unknown keyword argument is passed.
	    """
	    numbins = kwargs.pop("numbins", 10)
	    limits = kwargs.pop("defaultlimits", None)
	    weights = kwargs.pop("weights", None)
	    kwargs.pop("printextras", None)
	    if kwargs:
	        raise TypeError("Unexpected histogram arguments: %s"
	                        % ", ".join(sorted(kwargs)))

	    values = numpy.ravel(arr)
	    if limits is None and values.size > 0 and numbins > 1:
	        pad = 0.5 * (values.max() - values.min()) / (numbins - 1)
	        limits = (values.min() - pad, values.max() + pad)

	    #
	    # `scipy.stats.histogram` is no longer available in current SciPy
	    # releases; `numpy.histogram` is the documented replacement and also
	    # hands back the exact bucket edges.
	    #
	    buckets, edges = numpy.histogram(values, bins=numbins, range=limits,
	                                     weights=weights)

	    # The tallest bucket sets the scale; compute it once, not per bucket.
	    maxCount = max(buckets)
	    bars = "".join(GetBucketChar(count, maxCount) for count in buckets)

	    #
	    # The right hand label is the upper edge of the last bucket, i.e.
	    # lower + binsize * len(buckets), not one bucket beyond it.
	    #
	    return "%7.2fms : %s : %7.2fms" % (edges[0] / 1000, bars,
	                                       edges[-1] / 1000)


	def LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions):
	    """
	    Logs summary statistics for the executions of both variants.

	    Summary:
	        Calculates the mean (with its confidence interval) and a
	        histogram for each of the input arrays and emits them to
	        `logging.info`, one line per variant. Deliberately attempts to
	        align the columns and ensure that both histograms use the same
	        scale to improve readability.

	    Arguments:
	        args: The command line arguments containing the histogram
	            parameters (`histogram_buckets`) and the confidence level
	            (`confidence`).
	        oldVariant: The name of the old variant.
	        oldExecutions: A non-empty array of floats containing old
	            execution times in microseconds.
	        newVariant: The name of the new variant.
	        newExecutions: A non-empty array of floats containing new
	            execution times in microseconds.
	    """
	    #
	    # We take special care to ensure that the histograms will line up
	    # (same size bucket at same point on the screen).
	    #
	    minExecution = min(min(newExecutions), min(oldExecutions))
	    maxExecution = max(max(newExecutions), max(oldExecutions))

	    #
	    # Pad the range by half a bucket on either side so that the smallest
	    # and largest values sit in the middle of the first and last buckets.
	    # Note the `0.5`: written as `(1/2)` this is integer division under
	    # Python 2, which evaluates to 0 and silently disables the padding.
	    # A single bucket needs no padding (and would otherwise divide by
	    # zero).
	    #
	    if args.histogram_buckets > 1:
	        s = 0.5 * (maxExecution - minExecution) / (args.histogram_buckets - 1)
	    else:
	        s = 0.0
	    limits = (minExecution - s, maxExecution + s)

	    newExecutionHist = GetHistogramString(
	        newExecutions,
	        defaultlimits=limits,
	        numbins=args.histogram_buckets)
	    oldExecutionHist = GetHistogramString(
	        oldExecutions,
	        defaultlimits=limits,
	        numbins=args.histogram_buckets)

	    newExecutionMeanStr = "μ=" + GetMeanStr(newExecutions, args.confidence)
	    oldExecutionMeanStr = "μ=" + GetMeanStr(oldExecutions, args.confidence)
	    maxMeanLen = max(len(oldExecutionMeanStr), len(newExecutionMeanStr))

	    maxVariantLen = max(len(oldVariant), len(newVariant))

	    #
	    # `%-*s` left-justifies each column to the width passed just before
	    # it; a plain `%-s` would consume the widths as values and leave the
	    # five arguments mismatched against three specifiers.
	    #
	    logging.info(term.blue("%-*s : %-*s : %s"),
	                 maxVariantLen, oldVariant, maxMeanLen, oldExecutionMeanStr,
	                 oldExecutionHist)
	    logging.info(term.cyan("%-*s : %-*s : %s"),
	                 maxVariantLen, newVariant, maxMeanLen, newExecutionMeanStr,
	                 newExecutionHist)

	def DoABTest(args, oldVariant, oldExecutions, newVariant, newExecutions):
	    """
	    Checks whether the new variant regressed execution time.

	    Summary:
	        Logs the statistics of both variants, then compares the new
	        executions against the old executions scaled up by the allowed
	        regression (`args.max_regression`).

	    Arguments:
	        args: The command line arguments (`max_regression`, `confidence`
	            and the histogram parameters).
	        oldVariant: The name of the old variant.
	        oldExecutions: An array of floats containing old execution times.
	        newVariant: The name of the new variant.
	        newExecutions: An array of floats containing new execution times.

	    Returns:
	        False if the new mean is significantly above the scaled old mean,
	        True otherwise (including when the test is inconclusive).
	    """
	    logging.debug("AB Testing %s (old) vs %s (new)", oldVariant, newVariant)

	    allowance = 1 + args.max_regression
	    paddedOldExecutions = [allowance * sample for sample in oldExecutions]

	    LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions)

	    #
	    # Run a Welch two sample t test to ensure that we have not regressed
	    # execution perf.
	    #
	    # While this test assumes normality, the Welch's variant does *not*
	    # assume homoscedasticity (i.e. both populations have the same
	    # variance). Other similar tests, such as the Mann-Whitney U test,
	    # are sensitive to this property:
	    #
	    #   If the distributions are heteroscedastic, the Kruskal–Wallis test
	    #   won't help you; instead, you should use Welch's t–test for two
	    #   groups, or Welch's anova for more than two groups.
	    #
	    #   http://www.biostathandbook.com/kruskalwallis.html
	    #
	    _, pValue = stats.ttest_ind(newExecutions, paddedOldExecutions,
	                                equal_var=False)

	    # Not significant at the requested confidence: nothing can be concluded.
	    if not pValue < 1 - args.confidence:
	        logging.debug("Execution had too much variance to make conclusion")
	        return True

	    newMean = numpy.mean(newExecutions)
	    paddedOldMean = numpy.mean(paddedOldExecutions)
	    oldMean = numpy.mean(oldExecutions)

	    # A significant difference only fails the test if the new variant is slower.
	    if not newMean > paddedOldMean:
	        return True

	    # Report the slowdown relative to the unscaled old mean.
	    slowdownPct = ((newMean - oldMean) / oldMean) * 100
	    logging.error(
	        term.red("Execution regressed by %.1f%% (vs %.1f%%)"),
	        slowdownPct, args.max_regression*100)

	    # TODO Test 99th percentile

	    return False

	def main():
	    """
	    Parses the command line, reads the samples from stdin and AB tests them.

	    Input:
	        CSV rows of the form `<variant name>,<execution time in
	        microseconds>` on stdin, covering exactly two variants. Blank
	        lines are skipped. Unless `--old-variant` is given, the variant
	        of the first row is treated as the old variant.

	    Returns:
	        The result of `DoABTest`: False if the new variant regressed,
	        True otherwise.

	    Raises:
	        SystemExit: Via `parser.error` (exit status 2) if a row is
	            malformed, if the input does not contain exactly two
	            variants, or if `--old-variant` names a variant that is not
	            in the input.
	    """
	    parser = argparse.ArgumentParser(
	        description="AB variant execution performance (old vs new).",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	    # default=0 so that the comparison below never sees None when no -v
	    # is passed.
	    parser.add_argument("-v", "--verbosity", action="count", default=0,
	                        help="Increase output verbosity")

	    parser.add_argument("--confidence", type=float, default=0.999,
	                        help="Confidence interval (e.g. be 99.9%% confident " +
	                        "of all reported values.")
	    parser.add_argument("--max-regression", type=float, default=0.02,
	                        help="Maximum allowed execution regression (e.g. " +
	                        "new execution must within 2%% of old execution).")
	    parser.add_argument("--old-variant", help="Test type to use as old test " +
	                        "variant. By default, the first test type " +
	                        "encountered is the old variant.")

	    parser.add_argument("--histogram-buckets", type=int, default=15,
	                        help="Number of histogram buckets to use.")

	    args = parser.parse_args()

	    if args.verbosity >= 1:
	        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
	    else:
	        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

	    executions = defaultdict(list)
	    oldVariant = args.old_variant
	    for row in csv.reader(sys.stdin):
	        # csv.reader yields an empty list for a blank line.
	        if not row:
	            continue
	        if len(row) != 2:
	            parser.error("Expected rows of the form 'variant,micros', "
	                         "got: %r" % (row,))
	        name, executionMicros = row
	        try:
	            micros = float(executionMicros)
	        except ValueError:
	            parser.error("Execution time is not a number: %r"
	                         % (executionMicros,))
	        oldVariant = oldVariant if oldVariant else name
	        executions[name].append(micros)

	    #
	    # Validate explicitly rather than with `assert`, which is stripped
	    # when python runs with -O.
	    #
	    if len(executions) != 2:
	        parser.error("Can only do AB test of two test variants.")
	    if oldVariant not in executions:
	        parser.error("Old variant %r does not appear in the input."
	                     % (oldVariant,))

	    # Exactly two variants, one of them the old one: the other is the new.
	    (newVariant,) = [name for name in executions if name != oldVariant]

	    return DoABTest(args, oldVariant, executions[oldVariant],
	                    newVariant, executions[newVariant])


	if __name__ == "__main__":
	    #
	    # Report a regression through the exit status so that callers (e.g. a
	    # CI job) can act on it. Only an explicit False from main() counts as
	    # a failure; any other result, including None, exits with status 0.
	    #
	    sys.exit(1 if main() is False else 0)