Skip to content

Instantly share code, notes, and snippets.

@ilanasegall ilanasegall/stylo_metrics.ipynb Secret
Last active Aug 23, 2017

Embed
What would you like to do?
stylo_metrics
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/).
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from collections import defaultdict as dd
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import itertools
from moztelemetry.dataset import Dataset
from scipy.stats import mannwhitneyu
# Keep handles to the real Python builtins: the `from pyspark.sql.functions
# import *` above shadows max/map with Spark column functions.
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')  # notebook-only: inline matplotlib figures
# ### Define Experiment Variables
# In[2]:
# Histogram probes to compare across experiment branches, keyed by the
# ping path under which each histogram lives.
experiment_probes = {
    "payload/histograms": [
        "TIME_TO_DOM_INTERACTIVE_MS",
        "TIME_TO_DOM_COMPLETE_MS",
        "MEMORY_TOTAL",
        "MEMORY_UNIQUE",
        "MEMORY_VSIZE",
        "GHOST_WINDOWS",
    ],
    "payload/processes/content/histograms": [
        "TIME_TO_DOM_LOADING_MS",
    ],
}
# In[3]:
probe_names = list(itertools.chain(*experiment_probes.values()))
# To investigate:
#
# TOTAL_SCROLL_Y always = 0.0
# no entries for IDLE_NOTIFY_BACK_LISTENERS
#
# In[4]:
# Experiment identifier and its inclusive submission-date window.
EXPERIMENT_SLUG = "pref-flip-quantum-css-style-r1-1381147"
START_DATE = "20170725" #YYYYMMDD
END_DATE = "20170823" #YYYYMMDD
# ### Simple Utils
# In[5]:
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    """Walk nested dicts `d` along `keys`, returning {} for any missing key.

    Equivalent to d[keys[0]][keys[1]]...[keys[-1]] with {} substituted
    wherever a key is absent. `keys` must be non-empty.
    """
    node = d
    for key in keys[:-1]:
        node = node.get(key, {})
    return node.get(keys[-1], {})
# In[6]:
def median(lst):
    """Return the middle element of `lst` (the upper-middle for even length).

    Note: this does NOT sort its input and does not average the two middle
    elements — callers in this notebook pass already-sorted lists and only
    compare medians, so the upper-middle element is sufficient.
    Raises IndexError on an empty list.
    """
    # // (floor division): plain / is float division on Python 3, which
    # would raise "list indices must be integers" here.
    return lst[len(lst) // 2]
# ### Get data
# In[7]:
# Lazily enumerate the "telemetry-cohorts" dataset (per-experiment pings).
cohorts = Dataset.from_source("telemetry-cohorts")
cohorts.schema
# Fetch experiment data
# In[8]:
# Restrict to the experiment's date window and slug, then pull a 10%
# sample of records into an RDD (`sc` is the notebook's SparkContext).
pings = cohorts.where(submissionDate = lambda x: x >= START_DATE and x <= END_DATE) .where(experimentId=EXPERIMENT_SLUG) .records(sc, sample=.1)
pings.cache()
pings.count()  # forces evaluation so the cache is populated
# What were the different ping types observed for this experiment on this day?
# In[9]:
# pings.map(lambda f: f["meta"].get("docType", "None")).countByValue()
# In[10]:
# Keep only "main" pings; other docTypes (crash, etc.) are dropped.
main_pings = pings.filter(lambda x: x["meta"]["docType"]=="main")
# In[11]:
main_pings.cache()
# #### (testing samples)
# In[12]:
# subset = main_pings.take(1000)
# In[13]:
# subset = sc.parallelize(subset)
# In[14]:
# single = sc.parallelize(main_pings.take(1))
# In[15]:
# single.first()
# ### Utility functions
# In[16]:
def extract_probes(p, probes=None, slug=None):
    """Flatten one ping's histograms of interest into per-sample rows.

    Args:
        p: a decoded telemetry ping (nested dicts).
        probes: mapping of ping path -> list of histogram names; defaults
            to the module-level `experiment_probes`.
        slug: experiment id used to look up the branch; defaults to the
            module-level EXPERIMENT_SLUG.

    Returns a list of {"probe", "branch", "val"} dicts, one per histogram
    sample (each histogram bucket value is repeated `count` times).
    Pings without a branch tag for the experiment get branch "warning".
    """
    if probes is None:
        probes = experiment_probes
    if slug is None:
        slug = EXPERIMENT_SLUG
    branch = p.get("environment", {}).get("experiments", {}).get(slug, {}).get("branch", "warning")
    output = []
    for path, names in probes.items():
        for probe_name in names:
            # walk the slash-separated path into the nested ping dict,
            # substituting {} for any missing level
            node = p
            for key in path.split("/"):
                node = node.get(key, {})
            probe = node.get(probe_name, {})
            # take all entries in histogram right now. this is probably
            # problematic (includes every bucket). inspect more
            for k, v in probe.get("values", {}).items():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
# In[17]:
# get the bins we use for the histogram for a probe by looking at all branches
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    """Compute shared histogram bin edges for `probe_name` across branches.

    Pools all `val` samples for the probe from the module-level DataFrame
    `df`, trims the top and bottom 0.5% as a cheap outlier filter, and
    returns ~10 bin edges (log10-spaced when `logscale` is True).

    NOTE(review): chart_pref() plots np.log (natural log) of the data
    against these log10-spaced edges — confirm the bases are meant to match.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # remove top 0.5%, bottom 0.5% for easy outlier trimming
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        # guard: with fewer than 200 samples trim == 0 and the original
        # slice [0:-0] would silently produce an empty list
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        if all_branches_trimmed[0] < 1:
            # shift so log10 is defined (assumes values >= 0)
            all_branches_trimmed = [d + 1 for d in all_branches_trimmed]
        # was all_branches_trimmed[1]: off-by-one that skipped the minimum,
        # leaving the smallest samples outside the first bin
        return list(np.linspace(np.log10(all_branches_trimmed[0]),
                                np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# In[27]:
# return (proceed bool, reason)
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    """Decide whether a probe has a chartable number of samples.

    Counts rows for `pref_name` in the module-level DataFrame `df` and
    returns (True, None) when chartable, otherwise (False, reason).
    """
    count = df.where(df.probe == pref_name).count()
    if count == 0:
        return (False, "0 entries for pref %s"%pref_name)
    if count > 10000000:
        return (False, "%i values for pref %s"%(count,pref_name))
    return (True, None)
# In[28]:
# get values for branch of experiment for pref, and trim off outliers
def get_vals(pref_name, branch):
    """Collect one branch's samples for a probe, sorted and outlier-trimmed.

    Pulls `val` from the module-level DataFrame `df` for rows matching
    `pref_name` and `branch`, sorts them, and drops the top and bottom
    0.5% as a cheap outlier filter. Returns the sorted, trimmed list.
    """
    x_vals = [r.val for r in df.where(df.probe == pref_name)
                               .where(df.branch == branch)
                               .collect()]
    trim = int(len(x_vals) / 200.0)
    x_sorted = sorted(x_vals)
    # guard: with fewer than 200 samples trim == 0 and the original slice
    # [0:-0] would silently return an empty list
    return x_sorted[trim:-trim] if trim > 0 else x_sorted
# In[29]:
# return (pval, direction) if significant p value for mannwhitneyu vs control
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Mann-Whitney U test of a branch's samples against control.

    Returns (pvalue_str, direction): pvalue_str is str(pvalue) prefixed
    with "***" when below `p_threshold`, and direction reports whether the
    branch median is above or below control. Returns (None, None) when the
    test cannot be computed.
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # was a bare except; mannwhitneyu raises ValueError (e.g. when all
        # values are identical or a sample is empty) — catch only that so
        # real programming errors still surface
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# In[43]:
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean
def chart_pref(pref_name, logscale):
    """Plot per-branch histograms of `pref_name` on shared bins and print
    any branch whose distribution differs significantly from control.

    Uses module-level globals: df (via get_bins/get_vals), branches,
    n_branches, plt, py_map. "gecko" is treated as the control branch.
    """
    sig_branches = []  # NOTE(review): never appended to below — appears unused
    fig, axarr = plt.subplots(n_branches, 1, sharex=True)
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    control_vals = get_vals(pref_name, "gecko")
    for i in range(n_branches):
        if branches[i] == "control":
            # NOTE(review): branches is set to ["gecko", "stylo"] below, so
            # this arm never fires and gecko's values are fetched twice
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale: #always assume 0 as lowest val for now
            # shift by 1 so the log of a zero sample is defined
            x_trans = py_map(lambda d: d+1, x)
            # NOTE(review): np.log is natural log while get_bins spaces its
            # edges in log10 — confirm the bases are meant to match
            ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        if branches[i] == "gecko": continue  # don't test control against itself
        p, direction = test_unequal(x, control_vals)
        if p is not None and p.startswith("***"):
            # only significant branches are printed
            print branches[i], p #, direction
    plt.show()
# # Results
# In[44]:
# Explode every main ping into one row per histogram sample.
probe_dicts = main_pings.flatMap(extract_probes)
# In[45]:
# One DataFrame row per sample: (probe, branch, val).
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
# In[46]:
#check on branch distribution
# ("warning" counts pings that carried no branch tag for this experiment)
for b,v in main_pings.map(lambda x: x.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v
# In[47]:
branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
#for this one - set order
# NOTE(review): this overwrites the distinct() result above; assumes the
# only branches present are "gecko" and "stylo"
branches = ["gecko", "stylo"]
# In[48]:
# Chart every probe at linear and log scale, or print why it can't be charted.
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
        print
# In[ ]:
# In[ ]:
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.