Skip to content

Instantly share code, notes, and snippets.

@jtg567
Created March 8, 2018 00:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jtg567/e91d3c5a676324e0698b1975838e60c7 to your computer and use it in GitHub Desktop.
HTTP response throttling v2 (Pref Flip)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
from __future__ import division  # NOTE: must precede every other statement in a real module

# stdlib
import datetime as DT
import itertools
import sys  # used by extract_probes (histogram count sanity bound)
from collections import defaultdict as dd

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

# spark / telemetry
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from moztelemetry import get_pings_properties
from moztelemetry.dataset import Dataset
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')
## these ones pull the data
#
def recursive_get(d, keys):
if len(keys) == 1:
return d.get(keys[0],{})
return recursive_get(d.get(keys[0],{}), keys[1:])
def extract_probes(p):
branch = p.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
output = []
for path, probe_names in experiment_probes.iteritems():
for probe_name in probe_names:
probe = recursive_get(p, path.split("/")).get(probe_name, {})
#take all entries in histogram right now. this is probably problematic. inspect more
for k,v in probe.get("values",{}).iteritems():
if v <= sys.maxint:
output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
return output
def exptPings(slug, samp, start= DT.date.today().strftime("%Y%m%d"), end= DT.date.today().strftime("%Y%m%d")):
# returns a spark df with values from probe dictionary by branch and nothing else, default start/end dates are today
cohorts = Dataset.from_source("telemetry-cohorts")
main_pings = cohorts.where(submissionDate = lambda x: x >= start and x <= end).where(experimentId= slug).where(docType= "main").records(sc, sample= samp)
#main_pings.cache() this was in ilana's ognb, mreid advised to remove it for the exception I was hitting
probe_dicts = main_pings.flatMap(extract_probes)
return sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
## these ones do the analysis
#
###### get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
#remove top 0.5%, bottom 0.5% for easy outlier
trim = int(len(all_branches)/200.0)
all_branches_trimmed = sorted(all_branches)
all_branches_trimmed = all_branches_trimmed[trim:-1*trim]
if logscale:
if all_branches_trimmed[0] < 1:
all_branches_trimmed = py_map(lambda d: d+1, all_branches_trimmed)
return list(np.linspace(np.log10(all_branches_trimmed[1]), np.log10(all_branches_trimmed[-1]), 10))
n,b = np.histogram(all_branches_trimmed,10)
return b
# get values for branch of experiment for pref, and trim off outliers
def get_vals(pref_name, branch, samp, seed=None):
x_vals = [r.val for r in df.where(df.probe == pref_name) .where(df.branch == branch) .sample(False, samp, seed).collect()]
trim = int(len(x_vals)/200.0)
x_trimmed = sorted(x_vals)[trim:-1*trim]
return x_trimmed
def median(lst):
return lst[(len(lst))/2]
# return (pval, direction) if significant p value for mannwhitneyu vs control
def test_unequal(branch_vals, control_vals, p_threshold=.05):
try:
r = mannwhitneyu(branch_vals, control_vals)
except:
return None, None
prefix = ""
if r.pvalue < p_threshold:
prefix = "***"
if median(branch_vals) > median(control_vals):
return (prefix + str(r.pvalue), "> control")
return (prefix + str(r.pvalue), "< control")
# return (proceed bool, reason)
def can_chart_pref(pref_name):
n = df.where(df.probe == pref_name).count()
if n==0:
return (False, "0 entries for pref %s"%pref_name)
elif n>100000000:
return (False, "%i values for pref %s"%(n,pref_name))
return (True, None)
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean
def chart_pref(pref_name, logscale, samp):
sig_branches = []
fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey= 'col')
b = get_bins(pref_name, logscale)
plt.tight_layout()
control_vals = get_vals(pref_name, "Control", samp, 666)
for i in range(n_branches):
if branches[i] == "Control":
x = control_vals
else:
x = get_vals(pref_name, branches[i], samp, 666)
if logscale: #always assume 0 as lowest val for now
x_trans = py_map(lambda d: d+1, x)
ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b)
else:
axarr[i].hist(x, bins=b)
axarr[i].set_title(branches[i])
if branches[i] == "Control": continue
print "len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals))
if len(x) != 0 | len(control_vals) != 0:
# p, direction = test_unequal(x, control_vals)
# if p is not None and p.startswith("***"):
# print branches[i], p #, direction
print pref_name + " (logscale=" + str(logscale) + ")" + ":"
else:
print pref_name + " branch with no values"
continue
plt.show()
from statsmodels.distributions.empirical_distribution import ECDF
def chart_ecdfs(pref_name, samp, *percentiles):
legend = []
for i in range(n_branches):
legend_entry = ""
v = get_vals(pref_name, branches[i], samp, 666)
if len(v) == 0: continue
cdf = ECDF(v)
curr_plot = plt.plot(cdf.x, cdf.y)
curr_color = curr_plot[0].get_color()
legend_entry += branches[i]
for pct in percentiles:
p = np.percentile(np.array(v), pct)
plt.scatter(p, pct/100.0, facecolors = "none", edgecolors = curr_color, label="_nolegend_")
legend_entry += ", " + str(pct) + "th percentile=" + str(p)
legend.append(legend_entry)
plt.legend(legend, bbox_to_anchor=[1, .5], loc='center left')
plt.show()
EXPERIMENT_SLUG = "pref-flip-http-response-throttling-algo-v2-beta-1434388"
branches = ['Control', 'Variant']
n_branches = len(branches)
# In[2]:
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_INTERACTIVE_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.05, "20180220", "20180305")
for p in probe_names:
chart_pref(p, True, 1.0)
chart_ecdfs(p, 1.0, 50, 95)
# In[3]:
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_NON_BLANK_PAINT_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
chart_pref(p, True, 1.0)
chart_ecdfs(p, 1.0, 50, 95)
# In[4]:
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_CONTENT_LOADED_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
chart_pref(p, True, 1.0)
chart_ecdfs(p, 1.0, 50, 95)
# In[8]:
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_LOAD_EVENT_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
chart_pref(p, True, 1.0)
chart_ecdfs(p, 1.0, 50, 95)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment