Created
March 8, 2018 00:39
-
-
Save jtg567/e91d3c5a676324e0698b1975838e60c7 to your computer and use it in GitHub Desktop.
HTTP response throttling v2 (Pref Flip)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8

# In[1]:

# __future__ imports must precede every other statement in the module.
from __future__ import division

import datetime as DT
import itertools
import sys
from collections import defaultdict as dd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.plotly as py
from moztelemetry import get_pings_properties
from moztelemetry.dataset import Dataset
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

# Keep handles to the real builtins: the star imports above (and %pylab)
# shadow max/map with pyspark/numpy equivalents.
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')
## these ones pull the data | |
# | |
def recursive_get(d, keys):
    """Walk nested dicts along *keys*; missing levels fall back to {}."""
    head = keys[0]
    if len(keys) > 1:
        # Descend one level and recurse on the remaining path segments.
        return recursive_get(d.get(head, {}), keys[1:])
    return d.get(head, {})
def extract_probes(p):
    """Flatten one telemetry ping into rows of {probe, branch, val}.

    Reads the module-level EXPERIMENT_SLUG and experiment_probes. Each
    histogram bucket is expanded into `count` repeated sample rows.
    """
    experiments = p.get("environment", {}).get("experiments", {})
    branch = experiments.get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    rows = []
    for path, probe_names in experiment_probes.iteritems():
        path_keys = path.split("/")
        for probe_name in probe_names:
            histogram = recursive_get(p, path_keys).get(probe_name, {})
            # Take all entries in the histogram right now; the original
            # author flagged this as probably problematic — inspect more.
            for bucket, count in histogram.get("values", {}).iteritems():
                if count <= sys.maxint:
                    row = {"probe": probe_name, "branch": branch, "val": float(bucket)}
                    rows.extend([row] * int(count))
    return rows
def exptPings(slug, samp, start= DT.date.today().strftime("%Y%m%d"), end= DT.date.today().strftime("%Y%m%d")):
    """Return a Spark DataFrame of probe rows (probe/branch/val) for one experiment.

    NOTE(review): the start/end defaults are evaluated once at definition
    time, so "today" is frozen when the module loads — confirm intended.
    """
    cohorts = Dataset.from_source("telemetry-cohorts")
    main_pings = (cohorts
                  .where(submissionDate=lambda x: x >= start and x <= end)
                  .where(experimentId=slug)
                  .where(docType="main")
                  .records(sc, sample=samp))
    # .cache() was deliberately removed here (it triggered an exception;
    # advice from mreid).
    probe_dicts = main_pings.flatMap(extract_probes)
    return sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
## these ones do the analysis | |
# | |
###### get the bins we use for the histogram for a probe by looking at all branches | |
def get_bins(probe_name, logscale=False):
    """Compute 10 histogram bin edges for *probe_name*, pooled over all branches.

    Trims the top and bottom 0.5% of values as a cheap outlier filter, then
    returns np.histogram edges, or log10-spaced edges when *logscale* is set.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # Remove top 0.5% and bottom 0.5% for easy outlier handling.
    # BUG FIX: when trim == 0 (fewer than 200 values) the original slice
    # [trim:-trim] was [0:-0] == [0:0], an empty list; guard against that.
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        # Shift everything up by 1 so log10 is defined at value 0.
        if all_branches_trimmed[0] < 1:
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        # NOTE(review): the lower edge uses element [1], not [0] — looks like
        # a typo for the minimum, but kept as-is to preserve existing charts.
        return list(np.linspace(np.log10(all_branches_trimmed[1]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# get values for branch of experiment for pref, and trim off outliers | |
def get_vals(pref_name, branch, samp, seed=None):
    """Sample values of *pref_name* in *branch* (fraction *samp*, fixed *seed*),
    trimming 0.5% off each tail as a cheap outlier filter."""
    x_vals = [r.val for r in df.where(df.probe == pref_name)
                               .where(df.branch == branch)
                               .sample(False, samp, seed)
                               .collect()]
    trim = int(len(x_vals) / 200.0)
    # BUG FIX: when trim == 0 the original slice [trim:-trim] was [0:-0],
    # i.e. an empty list, silently discarding all values for small samples.
    x_sorted = sorted(x_vals)
    if trim > 0:
        return x_sorted[trim:-trim]
    return x_sorted
def median(lst):
    """Return the upper-middle element of *lst* (callers pass sorted lists).

    BUG FIX: with ``from __future__ import division`` in effect, ``len(lst)/2``
    is a float and cannot index a list — use integer division instead.
    Note this is the upper median for even lengths (no averaging), and the
    input is assumed already sorted (get_vals returns sorted values).
    """
    return lst[len(lst) // 2]
# return (pval, direction) if significant p value for mannwhitneyu vs control | |
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Mann-Whitney U test of *branch_vals* against *control_vals*.

    Returns (pvalue_string, direction): the p-value string is prefixed with
    '***' when below *p_threshold*, and direction is "> control" or
    "< control" by comparison of medians. Returns (None, None) when the
    test cannot be computed (degenerate input).
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # BUG FIX: was a bare `except:` that swallowed every error (even
        # NameError/KeyboardInterrupt); mannwhitneyu signals degenerate
        # inputs (e.g. identical samples) with ValueError specifically.
        return None, None
    prefix = "***" if r.pvalue < p_threshold else ""
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# return (proceed bool, reason) | |
def can_chart_pref(pref_name):
    """Return (proceed, reason): whether *pref_name* has a chartable row count."""
    row_count = df.where(df.probe == pref_name).count()
    if row_count == 0:
        return (False, "0 entries for pref %s" % pref_name)
    if row_count > 100000000:
        # Too many rows to chart sensibly.
        return (False, "%i values for pref %s" % (row_count, pref_name))
    return (True, None)
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean | |
def chart_pref(pref_name, logscale, samp): | |
sig_branches = [] | |
fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey= 'col') | |
b = get_bins(pref_name, logscale) | |
plt.tight_layout() | |
control_vals = get_vals(pref_name, "Control", samp, 666) | |
for i in range(n_branches): | |
if branches[i] == "Control": | |
x = control_vals | |
else: | |
x = get_vals(pref_name, branches[i], samp, 666) | |
if logscale: #always assume 0 as lowest val for now | |
x_trans = py_map(lambda d: d+1, x) | |
ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b) | |
else: | |
axarr[i].hist(x, bins=b) | |
axarr[i].set_title(branches[i]) | |
if branches[i] == "Control": continue | |
print "len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals)) | |
if len(x) != 0 | len(control_vals) != 0: | |
# p, direction = test_unequal(x, control_vals) | |
# if p is not None and p.startswith("***"): | |
# print branches[i], p #, direction | |
print pref_name + " (logscale=" + str(logscale) + ")" + ":" | |
else: | |
print pref_name + " branch with no values" | |
continue | |
plt.show() | |
from statsmodels.distributions.empirical_distribution import ECDF | |
def chart_ecdfs(pref_name, samp, *percentiles):
    """Overlay per-branch ECDFs of *pref_name*, marking the given *percentiles*."""
    legend_labels = []
    for idx in range(n_branches):
        branch = branches[idx]
        vals = get_vals(pref_name, branch, samp, 666)
        if len(vals) == 0:
            continue
        cdf = ECDF(vals)
        curve = plt.plot(cdf.x, cdf.y)
        # Reuse the curve's color for its percentile markers.
        curve_color = curve[0].get_color()
        label = branch
        for pct in percentiles:
            pct_val = np.percentile(np.array(vals), pct)
            plt.scatter(pct_val, pct / 100.0, facecolors="none", edgecolors=curve_color, label="_nolegend_")
            label += ", " + str(pct) + "th percentile=" + str(pct_val)
        legend_labels.append(label)
    plt.legend(legend_labels, bbox_to_anchor=[1, .5], loc='center left')
    plt.show()
# Experiment identity and branch layout read by the functions above
# (extract_probes reads EXPERIMENT_SLUG; the chart helpers read branches).
EXPERIMENT_SLUG = "pref-flip-http-response-throttling-algo-v2-beta-1434388"
branches = ['Control', 'Variant']
n_branches = len(branches)
# In[2]:

# Cell pattern repeated below: point the module-global `experiment_probes`
# at a single histogram (extract_probes reads it inside exptPings' flatMap),
# pull pings for the fixed date range, then chart histograms and ECDFs
# (log scale, 50th/95th percentiles). Order matters: experiment_probes must
# be assigned before each exptPings call.
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_INTERACTIVE_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
# 5% ping sample for this probe; later cells use 10%.
df = exptPings(EXPERIMENT_SLUG, 0.05, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[3]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_NON_BLANK_PAINT_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[4]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_CONTENT_LOADED_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[8]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_LOAD_EVENT_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment