Created
March 8, 2018 00:39
-
-
Save jtg567/e91d3c5a676324e0698b1975838e60c7 to your computer and use it in GitHub Desktop.
HTTP response throttling v2 (Pref Flip)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8

# In[1]:

# __future__ imports must precede every other statement in the module.
from __future__ import division

import datetime as DT
import itertools
import sys
from collections import defaultdict as dd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.plotly as py
from moztelemetry import get_pings_properties
from moztelemetry.dataset import Dataset
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

# Keep handles to the real builtins: the star imports above (and %pylab)
# shadow max/map with pyspark/numpy equivalents.
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')
## these ones pull the data | |
# | |
def recursive_get(d, keys):
    """Walk nested dicts along *keys*; missing levels fall back to {}."""
    head = keys[0]
    if len(keys) > 1:
        # Descend one level and recurse on the remaining path segments.
        return recursive_get(d.get(head, {}), keys[1:])
    return d.get(head, {})
def extract_probes(p):
    """Flatten one telemetry ping into rows of {probe, branch, val}.

    Reads the module-level EXPERIMENT_SLUG and experiment_probes. Each
    histogram bucket is expanded into `count` repeated sample rows.
    """
    experiments = p.get("environment", {}).get("experiments", {})
    branch = experiments.get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    rows = []
    for path, probe_names in experiment_probes.iteritems():
        path_keys = path.split("/")
        for probe_name in probe_names:
            histogram = recursive_get(p, path_keys).get(probe_name, {})
            # Take all entries in the histogram right now; the original
            # author flagged this as probably problematic — inspect more.
            for bucket, count in histogram.get("values", {}).iteritems():
                if count <= sys.maxint:
                    row = {"probe": probe_name, "branch": branch, "val": float(bucket)}
                    rows.extend([row] * int(count))
    return rows
def exptPings(slug, samp, start= DT.date.today().strftime("%Y%m%d"), end= DT.date.today().strftime("%Y%m%d")):
    """Return a Spark DataFrame of probe rows (probe/branch/val) for one experiment.

    NOTE(review): the start/end defaults are evaluated once at definition
    time, so "today" is frozen when the module loads — confirm intended.
    """
    cohorts = Dataset.from_source("telemetry-cohorts")
    main_pings = (cohorts
                  .where(submissionDate=lambda x: x >= start and x <= end)
                  .where(experimentId=slug)
                  .where(docType="main")
                  .records(sc, sample=samp))
    # .cache() was deliberately removed here (it triggered an exception;
    # advice from mreid).
    probe_dicts = main_pings.flatMap(extract_probes)
    return sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
## these ones do the analysis | |
# | |
###### get the bins we use for the histogram for a probe by looking at all branches | |
def get_bins(probe_name, logscale=False):
    """Compute 10 histogram bin edges for *probe_name*, pooled over all branches.

    Trims the top and bottom 0.5% of values as a cheap outlier filter, then
    returns np.histogram edges, or log10-spaced edges when *logscale* is set.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # Remove top 0.5% and bottom 0.5% for easy outlier handling.
    # BUG FIX: when trim == 0 (fewer than 200 values) the original slice
    # [trim:-trim] was [0:-0] == [0:0], an empty list; guard against that.
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        # Shift everything up by 1 so log10 is defined at value 0.
        if all_branches_trimmed[0] < 1:
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        # NOTE(review): the lower edge uses element [1], not [0] — looks like
        # a typo for the minimum, but kept as-is to preserve existing charts.
        return list(np.linspace(np.log10(all_branches_trimmed[1]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# get values for branch of experiment for pref, and trim off outliers | |
def get_vals(pref_name, branch, samp, seed=None):
    """Sample values of *pref_name* in *branch* (fraction *samp*, fixed *seed*),
    trimming 0.5% off each tail as a cheap outlier filter."""
    x_vals = [r.val for r in df.where(df.probe == pref_name)
                               .where(df.branch == branch)
                               .sample(False, samp, seed)
                               .collect()]
    trim = int(len(x_vals) / 200.0)
    # BUG FIX: when trim == 0 the original slice [trim:-trim] was [0:-0],
    # i.e. an empty list, silently discarding all values for small samples.
    x_sorted = sorted(x_vals)
    if trim > 0:
        return x_sorted[trim:-trim]
    return x_sorted
def median(lst):
    """Return the upper-middle element of *lst* (callers pass sorted lists).

    BUG FIX: with ``from __future__ import division`` in effect, ``len(lst)/2``
    is a float and cannot index a list — use integer division instead.
    Note this is the upper median for even lengths (no averaging), and the
    input is assumed already sorted (get_vals returns sorted values).
    """
    return lst[len(lst) // 2]
# return (pval, direction) if significant p value for mannwhitneyu vs control | |
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Mann-Whitney U test of *branch_vals* against *control_vals*.

    Returns (pvalue_string, direction): the p-value string is prefixed with
    '***' when below *p_threshold*, and direction is "> control" or
    "< control" by comparison of medians. Returns (None, None) when the
    test cannot be computed (degenerate input).
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # BUG FIX: was a bare `except:` that swallowed every error (even
        # NameError/KeyboardInterrupt); mannwhitneyu signals degenerate
        # inputs (e.g. identical samples) with ValueError specifically.
        return None, None
    prefix = "***" if r.pvalue < p_threshold else ""
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# return (proceed bool, reason) | |
def can_chart_pref(pref_name):
    """Return (proceed, reason): whether *pref_name* has a chartable row count."""
    row_count = df.where(df.probe == pref_name).count()
    if row_count == 0:
        return (False, "0 entries for pref %s" % pref_name)
    if row_count > 100000000:
        # Too many rows to chart sensibly.
        return (False, "%i values for pref %s" % (row_count, pref_name))
    return (True, None)
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean | |
def chart_pref(pref_name, logscale, samp): | |
sig_branches = [] | |
fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey= 'col') | |
b = get_bins(pref_name, logscale) | |
plt.tight_layout() | |
control_vals = get_vals(pref_name, "Control", samp, 666) | |
for i in range(n_branches): | |
if branches[i] == "Control": | |
x = control_vals | |
else: | |
x = get_vals(pref_name, branches[i], samp, 666) | |
if logscale: #always assume 0 as lowest val for now | |
x_trans = py_map(lambda d: d+1, x) | |
ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b) | |
else: | |
axarr[i].hist(x, bins=b) | |
axarr[i].set_title(branches[i]) | |
if branches[i] == "Control": continue | |
print "len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals)) | |
if len(x) != 0 | len(control_vals) != 0: | |
# p, direction = test_unequal(x, control_vals) | |
# if p is not None and p.startswith("***"): | |
# print branches[i], p #, direction | |
print pref_name + " (logscale=" + str(logscale) + ")" + ":" | |
else: | |
print pref_name + " branch with no values" | |
continue | |
plt.show() | |
from statsmodels.distributions.empirical_distribution import ECDF | |
def chart_ecdfs(pref_name, samp, *percentiles):
    """Overlay per-branch ECDFs of *pref_name*, marking the given *percentiles*."""
    legend_labels = []
    for idx in range(n_branches):
        branch = branches[idx]
        vals = get_vals(pref_name, branch, samp, 666)
        if len(vals) == 0:
            continue
        cdf = ECDF(vals)
        curve = plt.plot(cdf.x, cdf.y)
        # Reuse the curve's color for its percentile markers.
        curve_color = curve[0].get_color()
        label = branch
        for pct in percentiles:
            pct_val = np.percentile(np.array(vals), pct)
            plt.scatter(pct_val, pct / 100.0, facecolors="none", edgecolors=curve_color, label="_nolegend_")
            label += ", " + str(pct) + "th percentile=" + str(pct_val)
        legend_labels.append(label)
    plt.legend(legend_labels, bbox_to_anchor=[1, .5], loc='center left')
    plt.show()
# Experiment identity and branch layout read by the functions above
# (extract_probes reads EXPERIMENT_SLUG; the chart helpers read branches).
EXPERIMENT_SLUG = "pref-flip-http-response-throttling-algo-v2-beta-1434388"
branches = ['Control', 'Variant']
n_branches = len(branches)
# In[2]:

# Cell pattern repeated below: point the module-global `experiment_probes`
# at a single histogram (extract_probes reads it inside exptPings' flatMap),
# pull pings for the fixed date range, then chart histograms and ECDFs
# (log scale, 50th/95th percentiles). Order matters: experiment_probes must
# be assigned before each exptPings call.
experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_INTERACTIVE_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
# 5% ping sample for this probe; later cells use 10%.
df = exptPings(EXPERIMENT_SLUG, 0.05, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[3]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_NON_BLANK_PAINT_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[4]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_DOM_CONTENT_LOADED_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)

# In[8]:

experiment_probes = {}
experiment_probes["payload/processes/content/histograms"] = ["TIME_TO_LOAD_EVENT_START_ACTIVE_NETOPT_MS"]
probe_names = list(itertools.chain(*experiment_probes.values()))
df = exptPings(EXPERIMENT_SLUG, 0.1, "20180220", "20180305")
for p in probe_names:
    chart_pref(p, True, 1.0)
    chart_ecdfs(p, 1.0, 50, 95)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment