Last active
August 28, 2017 20:06
-
-
Save jtg567/3a69308c67724484369fc3650012b4a3 to your computer and use it in GitHub Desktop.
Race Cache 2 via ilana workflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/). | |
# In[1]: | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
from scipy.stats import chi2_contingency | |
from scipy.stats import ttest_ind | |
from collections import defaultdict as dd | |
from pyspark.sql import Row | |
from pyspark.sql import SQLContext | |
from pyspark.sql.types import * | |
from pyspark.sql.functions import * | |
import itertools | |
from moztelemetry.dataset import Dataset | |
from scipy.stats import mannwhitneyu | |
# ``__builtin__`` is not imported anywhere above; import it explicitly so the
# aliases below do not rely on the interactive shell having put that name in
# the namespace (NOTE(review): IPython on Python 2 appears to do so - confirm).
import __builtin__

# Keep handles on the real builtins: star imports (pyspark.sql.functions
# exports its own ``max``, for example) and the pylab magic below can rebind
# builtin names in this namespace.
py_max = __builtin__.max
py_map = __builtin__.map

# Turn on inline pylab/matplotlib output for the notebook.
get_ipython().magic(u'pylab inline')
# In[2]: | |
# Probes to compare across experiment branches, keyed by the slash-separated
# path inside a ping under which each group of probes is looked up.
experiment_probes = {
    "payload/histograms": [
        "NETWORK_RACE_CACHE_WITH_NETWORK_OCEC_ON_START_DIFF",
        "NETWORK_RACE_CACHE_WITH_NETWORK_SAVED_TIME",
        "NETWORK_RACE_CACHE_WITH_NETWORK_USAGE_2",
        "NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN",
        "NETWORK_RACE_CACHE_BANDWIDTH_RACE_CACHE_WIN",
        "NETWORK_RACE_CACHE_BANDWIDTH_NOT_RACE",
        "NETWORK_RACE_CACHE_VALIDATION",
        "HTTP_PAGE_COMPLETE_LOAD_V2",
        "HTTP_PAGE_COMPLETE_LOAD_NET_V2",
        "HTTP_PAGE_COMPLETE_LOAD_CACHED_V2",
        "HTTP_SUB_COMPLETE_LOAD_V2",
        "HTTP_SUB_COMPLETE_LOAD_CACHED_V2",
        "HTTP_SUB_COMPLETE_LOAD_NET_V2",
        "TOTAL_CONTENT_PAGE_LOAD_TIME",
    ],
}

# Every probe name in one flat list, whichever path it is stored under.
probe_names = [name for group in experiment_probes.values() for name in group]

EXPERIMENT_SLUG = "pref-flip-rcwn2-1381816"

# Submission-date window, formatted YYYYMMDD; both ends are compared
# inclusively where the pings are loaded.
START_DATE = "20170726"
END_DATE = "20170802"
# In[3]: | |
# Select this experiment's pings submitted inside the date window.
# NOTE(review): sample=0.01 is presumably a 1% sample - confirm against the
# moztelemetry Dataset.records documentation.
cohorts = Dataset.from_source("telemetry-cohorts")
pings = (
    cohorts
    .where(submissionDate=lambda day: START_DATE <= day <= END_DATE)
    .where(experimentId=EXPERIMENT_SLUG)
    .records(sc, sample=0.01)
)
#pings.cache()
#pings.count()
# In[4]:
# Keep only pings whose document type is "main", and cache them because
# several later cells read from this RDD.
main_pings = pings.filter(lambda ping: ping["meta"]["docType"] == "main")
main_pings.cache()
# In[ ]: | |
# haven't run this yet - trying to aggregate histograms per Client instead of collapsing over them
# Wrap each ping in a one-element list keyed by its clientId, then concatenate
# the lists so each client maps to a single list of its pings.
byClient = (
    main_pings
    .map(lambda ping: (ping['clientId'], [ping]))
    .reduceByKey(lambda left, right: left + right)
)
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries | |
def recursive_get(d, keys):
    """Follow *keys* through nested dictionaries and return what is found.

    Args:
        d: The outermost dictionary.
        keys: Sequence of keys to follow, outermost first.

    Returns:
        The value reached after applying every key in order. A missing key
        yields ``{}`` (and every later lookup then also yields ``{}``). When
        ``keys`` is empty, ``d`` itself is returned.
    """
    node = d
    for key in keys:
        # .get with a {} default keeps the walk going after a missing key.
        node = node.get(key, {})
    return node
def extract_probes(i):
    """Expand one client's pings into one record per recorded histogram sample.

    Args:
        i: A ``(clientId, pinglist)`` pair, where ``pinglist`` is the list of
            ping dictionaries collected for that client.

    Returns:
        A list of ``{"probe": name, "branch": branch, "val": float}`` dicts.
        Each histogram bucket contributes ``count`` entries whose ``val`` is
        the bucket key. ``branch`` is the client's experiment branch, "both"
        if its pings report more than one branch, or "warning" when a ping
        carries no branch for this experiment.
    """
    _client_id, pinglist = i

    # determine which branch this client was in and note if it ended up in both
    seen_branches = set()
    for p in pinglist:
        experiment = p.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {})
        # add(), not extend(): the branch is a string, and extending a list
        # with a string adds its characters one by one.
        seen_branches.add(experiment.get("branch", "warning"))

    if len(seen_branches) > 1:
        branch = "both"
    elif seen_branches:
        branch = next(iter(seen_branches))
    else:
        # No pings at all; nothing is emitted below, so the label is unused.
        branch = "warning"

    output = []
    for p in pinglist:
        for path, probe_names in experiment_probes.items():
            # The container at this path is the same for every probe under it.
            container = recursive_get(p, path.split("/"))
            for probe_name in probe_names:
                probe = container.get(probe_name, {})
                for k, v in probe.get("values", {}).items():
                    output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))

    # NOTE(review): the records are still one per sample and carry no client
    # id, so nothing here is aggregated per client yet - only the branch label
    # is resolved at client level.
    return output
# Apply the per-client extract_probes defined just above to every
# (clientId, pinglist) pair. flatMap needs that function to return an iterable
# of records for each client.
fin = byClient.flatMap(extract_probes)
# In[5]: | |
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries | |
def recursive_get(d, keys):
    """Follow *keys* through nested dictionaries and return what is found.

    This cell re-defines ``recursive_get``; an earlier cell defines a function
    with the same name.

    Args:
        d: The outermost dictionary.
        keys: Sequence of keys to follow, outermost first.

    Returns:
        The value reached after applying every key in order. A missing key
        yields ``{}`` (and every later lookup then also yields ``{}``). When
        ``keys`` is empty, ``d`` itself is returned.
    """
    current = d
    for key in keys:
        # .get with a {} default keeps the walk going after a missing key.
        current = current.get(key, {})
    return current
def extract_probes(p):
    """Expand a single ping's histograms into one record per recorded sample.

    Args:
        p: A ping dictionary.

    Returns:
        A list of ``{"probe": name, "branch": branch, "val": float}`` dicts.
        Each histogram bucket contributes ``count`` entries whose ``val`` is
        the bucket key. ``branch`` is "warning" when the ping carries no
        branch for this experiment.
    """
    experiments = p.get("environment", {}).get("experiments", {})
    branch = experiments.get(EXPERIMENT_SLUG, {}).get("branch", "warning")

    output = []
    for path, names in experiment_probes.items():
        key_path = path.split("/")
        for probe_name in names:
            histogram = recursive_get(p, key_path).get(probe_name, {})
            #take all entries in histogram right now. this is probably problematic. inspect more
            for bucket, count in histogram.get("values", {}).items():
                record = {"probe": probe_name, "branch": branch, "val": float(bucket)}
                # One (shared) record per sample counted in this bucket.
                output.extend([record] * int(count))
    return output
# One record per histogram sample, produced per ping.
probe_dicts = main_pings.flatMap(extract_probes)
# In[6]:
# Columns follow the record keys: probe, branch, val.
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
# In[7]:
#check on branch distribution
branch_counts = main_pings.map(
    lambda x: x.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
).countByValue()
for b, v in branch_counts.items():
    print("%s : %s" % (b, v))
# In[8]:
# Branch labels actually present in the data (may include "warning").
observed_branches = [i.branch for i in df.select("branch").distinct().collect()]
#for this one - set order
branches = ["control", "rcwn-enabled"]
# Count the branches that will be charted, not the ones observed: the charting
# code indexes ``branches`` with range(n_branches), so the two must agree.
n_branches = len(branches)
# In[9]: | |
# get the bins we use for the histogram for a probe by looking at all branches | |
def get_bins(probe_name, logscale=False):
    """Compute histogram bin edges for a probe from the values of all branches.

    The top and bottom 0.5% of values are dropped as outliers before the
    edges are computed.

    Args:
        probe_name: Value of the ``probe`` column to select in ``df``.
        logscale: When true, return edges in ``log(value + 1)`` space, the
            transform ``chart_pref`` applies before plotting.

    Returns:
        For ``logscale``: a list of 10 evenly spaced edges between the
        transformed smallest and largest trimmed value. Otherwise: the edge
        array from ``np.histogram`` with 10 bins.

    Raises:
        IndexError: If ``logscale`` is true and the probe has no values.
    """
    values = sorted(r.val for r in df.where(df.probe == probe_name).collect())

    #remove top 0.5%, bottom 0.5% for easy outlier
    trim = len(values) // 200
    # Use an explicit end index: values[trim:-trim] is an EMPTY slice when
    # trim == 0, which would discard every value of a small sample.
    trimmed = values[trim:len(values) - trim]

    if logscale:
        # Same transform as the plotted data (natural log of value + 1), so
        # the edges and the data live in the same space; +1 keeps 0 finite.
        low = np.log(trimmed[0] + 1)
        high = np.log(trimmed[-1] + 1)
        return list(np.linspace(low, high, 10))

    _counts, edges = np.histogram(trimmed, 10)
    return edges
# get values for branch of experiment for pref, and trim off outliers | |
def get_vals(pref_name, branch):
    """Return one branch's values for a probe, sorted, with outliers trimmed.

    Args:
        pref_name: Value of the ``probe`` column to select in ``df``.
        branch: Value of the ``branch`` column to select in ``df``.

    Returns:
        The matching ``val`` entries in ascending order, with the lowest and
        highest 0.5% removed. Samples with fewer than 200 values are returned
        untrimmed.
    """
    rows = df.where(df.probe == pref_name).where(df.branch == branch).collect()
    x_vals = sorted(r.val for r in rows)
    trim = len(x_vals) // 200
    # Use an explicit end index: x_vals[trim:-trim] is an EMPTY slice when
    # trim == 0, which would silently drop every value of a small sample.
    return x_vals[trim:len(x_vals) - trim]
def median(lst):
    """Return the middle element of an already-sorted sequence.

    The input is not sorted here, so callers must pass sorted data. For an
    even number of elements the upper of the two middle elements is returned
    (no averaging).

    Raises:
        IndexError: If ``lst`` is empty.
    """
    # Floor division keeps the index an integer on Python 2 and Python 3.
    return lst[len(lst) // 2]
# return (pval, direction) if significant p value for mannwhitneyu vs control | |
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Compare a branch against control with a Mann-Whitney U test.

    Args:
        branch_vals: Sorted values for the branch under test.
        control_vals: Sorted values for the control branch.
        p_threshold: p-values below this are flagged as significant.

    Returns:
        ``(pval, direction)``. ``pval`` is the p-value as a string, prefixed
        with ``"***"`` when it is below ``p_threshold``. ``direction`` is
        ``"> control"`` when the branch median exceeds the control median and
        ``"< control"`` otherwise. ``(None, None)`` is returned when the test
        rejects its input.
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # Raised by the test for input it cannot rank (NOTE(review): e.g. all
        # values identical in older SciPy - confirm for the installed version).
        # Anything else, including a notebook interrupt, should propagate.
        return None, None

    prefix = "***" if r.pvalue < p_threshold else ""
    if median(branch_vals) > median(control_vals):
        direction = "> control"
    else:
        direction = "< control"
    return (prefix + str(r.pvalue), direction)
# return (proceed bool, reason) | |
def can_chart_pref(pref_name):
    """Decide whether a probe has a chartable number of values.

    Args:
        pref_name: Value of the ``probe`` column to count in ``df``.

    Returns:
        ``(True, None)`` when the probe can be charted, otherwise
        ``(False, reason)`` where ``reason`` says the probe has no entries or
        more than 10,000,000 of them.
    """
    row_count = df.where(df.probe == pref_name).count()
    if row_count == 0:
        return (False, "0 entries for pref %s"%pref_name)
    if row_count > 10000000:
        # Too many rows to collect for charting.
        return (False, "%i values for pref %s"%(row_count, pref_name))
    return (True, None)
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean | |
def chart_pref(pref_name, logscale):
    """Plot one histogram per branch for a probe and test each against control.

    One subplot is drawn per entry of ``branches`` (up to ``n_branches``),
    all sharing the bin edges from ``get_bins``. Every non-control branch is
    compared with the control values via ``test_unequal``; branches flagged
    as significant are printed.

    Args:
        pref_name: Probe name to chart.
        logscale: When true, plot ``log(value + 1)`` instead of raw values.
    """
    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when only one subplot is requested.
    fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey='col', squeeze=False)
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print(pref_name + " (logscale=" + str(logscale) + ")" + ":")

    control_vals = get_vals(pref_name, "control")
    # zip stops at the shorter sequence, so a mismatch between n_branches and
    # len(branches) cannot index past the end of either.
    for ax, branch in zip(axarr[:, 0], branches):
        if branch == "control":
            x = control_vals
        else:
            x = get_vals(pref_name, branch)

        if logscale:  #always assume 0 as lowest val for now
            ax.hist(np.log(np.asarray(x, dtype=float) + 1), bins=b)
        else:
            ax.hist(x, bins=b)
        ax.set_title(branch)

        if branch == "control":
            continue

        print("len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals)))
        # Both samples must be non-empty. The former
        # ``len(x) != 0 | len(control_vals) != 0`` parsed as
        # ``len(x) != len(control_vals) != 0`` because | binds tighter than !=.
        if len(x) != 0 and len(control_vals) != 0:
            p, direction = test_unequal(x, control_vals)
            if p is not None and p.startswith("***"):
                print("%s %s" % (branch, p))  #, direction
        else:
            print("branch with no values")
    plt.show()
# Chart every probe that has a workable number of values, first on a linear
# scale and then on a log scale; otherwise print why it was skipped.
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if not can_chart:
        print(reason)
        continue
    for use_log in (False, True):
        chart_pref(p, use_log)
# In[17]:
print(get_vals('NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN', "control"))
# there is seriously NOTHING there in the control branch for probes starting with N_R_C_ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment