# Race Cache 2 via ilana workflow
# coding: utf-8
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/).
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from collections import defaultdict as dd
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import itertools
from moztelemetry.dataset import Dataset
from scipy.stats import mannwhitneyu
# the wildcard imports above shadow some builtins (pyspark.sql.functions
# exports its own max, for example), so keep handles to the originals
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')
# In[2]:
experiment_probes = {}
experiment_probes["payload/histograms"] = [
"NETWORK_RACE_CACHE_WITH_NETWORK_OCEC_ON_START_DIFF",
"NETWORK_RACE_CACHE_WITH_NETWORK_SAVED_TIME",
"NETWORK_RACE_CACHE_WITH_NETWORK_USAGE_2",
"NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN",
"NETWORK_RACE_CACHE_BANDWIDTH_RACE_CACHE_WIN",
"NETWORK_RACE_CACHE_BANDWIDTH_NOT_RACE",
"NETWORK_RACE_CACHE_VALIDATION",
"HTTP_PAGE_COMPLETE_LOAD_V2",
"HTTP_PAGE_COMPLETE_LOAD_NET_V2",
"HTTP_PAGE_COMPLETE_LOAD_CACHED_V2",
"HTTP_SUB_COMPLETE_LOAD_V2",
"HTTP_SUB_COMPLETE_LOAD_CACHED_V2",
"HTTP_SUB_COMPLETE_LOAD_NET_V2",
"TOTAL_CONTENT_PAGE_LOAD_TIME",
]
probe_names = list(itertools.chain(*experiment_probes.values()))
EXPERIMENT_SLUG = "pref-flip-rcwn2-1381816"
START_DATE = "20170726" #YYYYMMDD
END_DATE = "20170802" #YYYYMMDD
# In[3]:
cohorts = Dataset.from_source("telemetry-cohorts")
# records() yields an RDD of decoded ping dicts; sample=0.01 reads roughly a 1% sample
pings = cohorts.where(submissionDate=lambda x: x >= START_DATE and x <= END_DATE) \
               .where(experimentId=EXPERIMENT_SLUG) \
               .records(sc, sample=0.01)
#pings.cache()
#pings.count()
# In[4]:
main_pings = pings.filter(lambda x: x["meta"]["docType"] == "main")
main_pings.cache()
# In[ ]:
# haven't run this yet - trying to aggregate histograms per client instead of collapsing over them
byClient = main_pings.map(lambda p: (p['clientId'], [p])).reduceByKey(lambda x,y: x+y)
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    if len(keys) == 1:
        return d.get(keys[0], {})
    return recursive_get(d.get(keys[0], {}), keys[1:])
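# e.g. recursive_get(ping, "payload/histograms".split("/")) returns
# ping["payload"]["histograms"], or {} if any key along the path is missing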
def extract_probes(i):
    clientId, pinglist = i
    # determine which branch this client was in and note if it ended up in both
    branch = []
    for p in pinglist:
        branch.append(p.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning"))
    if len(np.unique(branch)) > 1:
        branch = "both"
    else:
        branch = np.unique(branch)[0]
    # taking a (clientId, pinglist) pair instead of a single ping adds one more
    # layer of iteration over the list compared to the per-ping version below
    output = []
    for p in pinglist:
        for path, probe_names in experiment_probes.iteritems():
            for probe_name in probe_names:
                probe = recursive_get(p, path.split("/")).get(probe_name, {})
                for k, v in probe.get("values", {}).iteritems():
                    output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    # by this point all of this client's pings are folded into one aggregate
    # histogram per probe; see the sketch after this cell for one way to use it
    return output
fin = byClient.flatMap(extract_probes)
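# A possible continuation (a sketch, not from the original notebook): summarize
# each client's values per probe with a per-client median, then compare those
# medians across branches so no single heavy client dominates. summarize_client
# is a hypothetical helper built on extract_probes above.
def summarize_client(i):
    clientId, pinglist = i
    per_probe = dd(list)
    for r in extract_probes((clientId, pinglist)):
        per_probe[(r["branch"], r["probe"])].append(r["val"])
    out = []
    for (branch, probe), vals in per_probe.iteritems():
        vals.sort()
        out.append({"probe": probe, "branch": branch, "val": vals[len(vals) / 2]})
    return out
client_medians = byClient.flatMap(summarize_client)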
# In[5]:
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    if len(keys) == 1:
        return d.get(keys[0], {})
    return recursive_get(d.get(keys[0], {}), keys[1:])
def extract_probes(p):
    branch = p.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    output = []
    for path, probe_names in experiment_probes.iteritems():
        for probe_name in probe_names:
            probe = recursive_get(p, path.split("/")).get(probe_name, {})
            # take all entries in the histogram for now; this is probably
            # problematic - inspect more
            for k, v in probe.get("values", {}).iteritems():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
probe_dicts = main_pings.flatMap(extract_probes)
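# Each emitted record has the shape (example values illustrative only):
#   {"probe": "TOTAL_CONTENT_PAGE_LOAD_TIME", "branch": "control", "val": 123.0}
# Note that a telemetry histogram's "values" maps bucket lower bounds to
# counts, so repeating the bucket value `count` times (as above) only
# approximates the raw distribution at bucket resolution.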
# In[6]:
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
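# optional sanity checks on the assembled DataFrame (standard pyspark calls),
# left commented out like the cache/count probes above:
#df.printSchema()   # columns: branch, probe, val
#df.show(5)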
# In[7]:
#check on branch distribution
for b, v in main_pings.map(lambda x: x.get("environment", {}).get("experiments", {}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v
# In[8]:
branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
# for this experiment, set the branch display order explicitly
branches = ["control", "rcwn-enabled"]
# In[9]:
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # trim the top and bottom 0.5% as a crude outlier filter (skip the trim
    # when it rounds to zero, since lst[0:0] would be empty)
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        if all_branches_trimmed[0] < 1:
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        return list(np.linspace(np.log10(all_branches_trimmed[0]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# get values for one branch of the experiment for a probe, and trim off outliers
def get_vals(pref_name, branch):
    x_vals = [r.val for r in df.where(df.probe == pref_name).where(df.branch == branch).collect()]
    trim = int(len(x_vals) / 200.0)
    x_trimmed = sorted(x_vals)
    if trim > 0:
        x_trimmed = x_trimmed[trim:-trim]
    return x_trimmed
def median(lst):
    # assumes lst is sorted and non-empty (true for get_vals output)
    return lst[len(lst) / 2]
# return (pval, direction); pval is prefixed with "***" when mannwhitneyu vs
# control is significant at p_threshold
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # mannwhitneyu raises ValueError when it cannot rank the inputs
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
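# Example of the return shape (numbers illustrative, not real output):
#   ("***0.003", "> control")   significant, branch median above control
#   ("0.31", "< control")       not significant
#   (None, None)                mannwhitneyu could not run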
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    n = df.where(df.probe == pref_name).count()
    if n == 0:
        return (False, "0 entries for pref %s" % pref_name)
    elif n > 10000000:
        return (False, "%i values for pref %s" % (n, pref_name))
    return (True, None)
# chart histograms for all branches of a probe (linear and log scale) and test
# whether any branch differs from control
def chart_pref(pref_name, logscale):
    fig, axarr = plt.subplots(n_branches, 1, sharex=True, sharey='col')
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    control_vals = get_vals(pref_name, "control")
    for i in range(n_branches):
        if branches[i] == "control":
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale:  # always assume 0 as lowest val for now
            x_trans = py_map(lambda d: d + 1, x)
            # log10 to match the log10-spaced bins from get_bins
            axarr[i].hist(np.log10(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        if branches[i] == "control":
            continue
        print "len(branch_vals) = " + str(len(x)) + ", len(control_vals) = " + str(len(control_vals))
        if len(x) != 0 and len(control_vals) != 0:
            p, direction = test_unequal(x, control_vals)
            if p is not None and p.startswith("***"):
                print branches[i], p  # , direction
        else:
            print "branch with no values"
            continue
    plt.show()
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
        print
# In[17]:
print get_vals('NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN', "control")
# there is seriously NOTHING in the control branch for the NETWORK_RACE_CACHE_* probes