# Race Cache 2 via ilana workflow
# coding: utf-8
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/).
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from collections import defaultdict as dd
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import itertools
from moztelemetry.dataset import Dataset
from scipy.stats import mannwhitneyu
# the wildcard imports above shadow some builtins (pyspark.sql.functions
# exports its own max, for example), so keep handles to the originals
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')
# In[2]:
experiment_probes = {}
experiment_probes["payload/histograms"] = [
"NETWORK_RACE_CACHE_WITH_NETWORK_OCEC_ON_START_DIFF",
"NETWORK_RACE_CACHE_WITH_NETWORK_SAVED_TIME",
"NETWORK_RACE_CACHE_WITH_NETWORK_USAGE_2",
"NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN",
"NETWORK_RACE_CACHE_BANDWIDTH_RACE_CACHE_WIN",
"NETWORK_RACE_CACHE_BANDWIDTH_NOT_RACE",
"NETWORK_RACE_CACHE_VALIDATION",
"HTTP_PAGE_COMPLETE_LOAD_V2",
"HTTP_PAGE_COMPLETE_LOAD_NET_V2",
"HTTP_PAGE_COMPLETE_LOAD_CACHED_V2",
"HTTP_SUB_COMPLETE_LOAD_V2",
"HTTP_SUB_COMPLETE_LOAD_CACHED_V2",
"HTTP_SUB_COMPLETE_LOAD_NET_V2",
"TOTAL_CONTENT_PAGE_LOAD_TIME",
]
probe_names = list(itertools.chain(*experiment_probes.values()))
EXPERIMENT_SLUG = "pref-flip-rcwn2-1381816"
START_DATE = "20170726" #YYYYMMDD
END_DATE = "20170802" #YYYYMMDD
# In[3]:
cohorts = Dataset.from_source("telemetry-cohorts")
# records() yields an RDD of decoded ping dicts; sample=0.01 reads roughly a 1% sample
pings = cohorts.where(submissionDate=lambda x: x >= START_DATE and x <= END_DATE) \
               .where(experimentId=EXPERIMENT_SLUG) \
               .records(sc, sample=0.01)
#pings.cache()
#pings.count()
# In[4]:
main_pings = pings.filter(lambda x: x["meta"]["docType"] == "main")
main_pings.cache()
# In[ ]:
# haven't run this yet - trying to aggregate histograms per client instead of collapsing over them
byClient = main_pings.map(lambda p: (p['clientId'], [p])).reduceByKey(lambda x,y: x+y)
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    if len(keys) == 1:
        return d.get(keys[0], {})
    return recursive_get(d.get(keys[0], {}), keys[1:])
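# e.g. recursive_get(ping, "payload/histograms".split("/")) returns
# ping["payload"]["histograms"], or {} if any key along the path is missing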
def extract_probes(i):
    clientId, pinglist = i
    # determine which branch this client was in and note if it ended up in both
    branch = []
    for p in pinglist:
        branch.append(p.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning"))
    if len(np.unique(branch)) > 1:
        branch = "both"
    else:
        branch = np.unique(branch)[0]
    # taking a (clientId, pinglist) pair instead of a single ping adds one more
    # layer of iteration over the list compared to the per-ping version below
    output = []
    for p in pinglist:
        for path, probe_names in experiment_probes.iteritems():
            for probe_name in probe_names:
                probe = recursive_get(p, path.split("/")).get(probe_name, {})
                for k, v in probe.get("values", {}).iteritems():
                    output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    # by this point all of this client's pings are folded into one aggregate
    # histogram per probe; see the sketch after this cell for one way to use it
    return output
fin = byClient.flatMap(extract_probes)
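# A possible continuation (a sketch, not from the original notebook): summarize
# each client's values per probe with a per-client median, then compare those
# medians across branches so no single heavy client dominates. summarize_client
# is a hypothetical helper built on extract_probes above.
def summarize_client(i):
    clientId, pinglist = i
    per_probe = dd(list)
    for r in extract_probes((clientId, pinglist)):
        per_probe[(r["branch"], r["probe"])].append(r["val"])
    out = []
    for (branch, probe), vals in per_probe.iteritems():
        vals.sort()
        out.append({"probe": probe, "branch": branch, "val": vals[len(vals) / 2]})
    return out
client_medians = byClient.flatMap(summarize_client)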
# In[5]:
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    if len(keys) == 1:
        return d.get(keys[0], {})
    return recursive_get(d.get(keys[0], {}), keys[1:])
def extract_probes(p):
    branch = p.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    output = []
    for path, probe_names in experiment_probes.iteritems():
        for probe_name in probe_names:
            probe = recursive_get(p, path.split("/")).get(probe_name, {})
            # take all entries in the histogram for now; this is probably
            # problematic - inspect more
            for k, v in probe.get("values", {}).iteritems():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
probe_dicts = main_pings.flatMap(extract_probes)
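# Each emitted record has the shape (example values illustrative only):
#   {"probe": "TOTAL_CONTENT_PAGE_LOAD_TIME", "branch": "control", "val": 123.0}
# Note that a telemetry histogram's "values" maps bucket lower bounds to
# counts, so repeating the bucket value `count` times (as above) only
# approximates the raw distribution at bucket resolution.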
# In[6]:
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
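# optional sanity checks on the assembled DataFrame (standard pyspark calls),
# left commented out like the cache/count probes above:
#df.printSchema()   # columns: branch, probe, val
#df.show(5)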
# In[7]:
#check on branch distribution
for b, v in main_pings.map(lambda x: x.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v
# In[8]:
branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
# for this experiment, set the branch display order explicitly
branches = ["control", "rcwn-enabled"]
# In[9]:
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # trim the top and bottom 0.5% as a crude outlier filter (skip the trim
    # when it rounds to zero, since lst[0:0] would be empty)
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        if all_branches_trimmed[0] < 1:
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        return list(np.linspace(np.log10(all_branches_trimmed[0]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# get values for one branch of the experiment for a probe, and trim off outliers
def get_vals(pref_name, branch):
    x_vals = [r.val for r in df.where(df.probe == pref_name).where(df.branch == branch).collect()]
    trim = int(len(x_vals) / 200.0)
    x_trimmed = sorted(x_vals)
    if trim > 0:
        x_trimmed = x_trimmed[trim:-trim]
    return x_trimmed
def median(lst):
    # assumes lst is sorted and non-empty (true for get_vals output)
    return lst[len(lst) / 2]
# return (pval, direction); pval is prefixed with "***" when mannwhitneyu vs
# control is significant at p_threshold
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # mannwhitneyu raises ValueError when it cannot rank the inputs
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
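# Example of the return shape (numbers illustrative, not real output):
#   ("***0.003", "> control")   significant, branch median above control
#   ("0.31", "< control")       not significant
#   (None, None)                mannwhitneyu could not run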
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    n = df.where(df.probe == pref_name).count()
    if n == 0:
        return (False, "0 entries for pref %s" % pref_name)
    elif n > 10000000:
        return (False, "%i values for pref %s" % (n, pref_name))
    return (True, None)
# chart histograms for all branches of a probe (linear and log scale) and test
# whether any branch differs from control
def chart_pref(pref_name, logscale):
    fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey='col')
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    control_vals = get_vals(pref_name, "control")
    for i in range(n_branches):
        if branches[i] == "control":
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale:  # always assume 0 as lowest val for now
            x_trans = py_map(lambda d: d + 1, x)
            # log10 to match the log10-spaced bins from get_bins
            axarr[i].hist(np.log10(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        if branches[i] == "control":
            continue
        print "len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals))
        if len(x) != 0 and len(control_vals) != 0:
            p, direction = test_unequal(x, control_vals)
            if p is not None and p.startswith("***"):
                print branches[i], p  # , direction
        else:
            print "branch with no values"
            continue
    plt.show()
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
        print
# In[17]:
print get_vals('NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN', "control")
# there is seriously NOTHING in the control branch for the NETWORK_RACE_CACHE_* probes