Skip to content

Instantly share code, notes, and snippets.

@ilanasegall
Last active August 23, 2017 20:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ilanasegall/a422dcdbaec8b0c44b984567a9a04a42 to your computer and use it in GitHub Desktop.
Save ilanasegall/a422dcdbaec8b0c44b984567a9a04a42 to your computer and use it in GitHub Desktop.
stylo_metrics
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/).
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from collections import defaultdict as dd
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import itertools
from moztelemetry.dataset import Dataset
from scipy.stats import mannwhitneyu
# The pyspark.sql.functions star-import shadows the builtin max/map with
# Spark column functions, so keep handles to the real builtins
# (Python 2's __builtin__ module).
py_max = __builtin__.max
py_map = __builtin__.map
# Notebook-export artifact: enables inline matplotlib plotting.
get_ipython().magic(u'pylab inline')
# ### Define Experiment Variables
# In[2]:
# Histograms to compare across experiment branches, keyed by the path
# inside the ping payload where each histogram lives.
experiment_probes = {
    "payload/histograms": [
        "TIME_TO_DOM_INTERACTIVE_MS",
        "TIME_TO_DOM_COMPLETE_MS",
        "MEMORY_TOTAL",
        "MEMORY_UNIQUE",
        "MEMORY_VSIZE",
        "GHOST_WINDOWS",
    ],
    "payload/processes/content/histograms": [
        "TIME_TO_DOM_LOADING_MS",
    ],
}
# In[3]:
# Flat list of every probe name across all payload paths.
probe_names = [name for path_probes in experiment_probes.values() for name in path_probes]
# To investigate:
#
# TOTAL_SCROLL_Y always = 0.0
# no entries for IDLE_NOTIFY_BACK_LISTENERS
#
# In[4]:
# Experiment identifier as it appears in the ping's environment/experiments map.
EXPERIMENT_SLUG = "pref-flip-quantum-css-style-r1-1381147"
# Inclusive submission-date window for the pings to analyze.
START_DATE = "20170725" #YYYYMMDD
END_DATE = "20170823" #YYYYMMDD
# ### Simple Utils
# In[5]:
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    """Look up a nested key path in a dict of dicts.

    Each element of `keys` descends one level, substituting {} when a
    key is missing, so a dead-end path yields {} instead of raising.
    Iterative rewrite of the original recursion: identical results for
    any non-empty path, but it also tolerates an empty `keys` (returns
    `d` unchanged, where the original raised IndexError) and cannot hit
    the recursion limit on very deep paths.
    """
    for key in keys:
        d = d.get(key, {})
    return d
# In[6]:
def median(lst):
    """Return the middle element of an already-sorted list.

    This is the upper median for even-length input (no averaging of the
    two middle elements), matching the original behavior. Uses floor
    division so the index is an int on both Python 2 and Python 3 (the
    original `/` produced a float index and crashed on Python 3).
    Assumes `lst` is sorted and non-empty.
    """
    return lst[len(lst) // 2]
# ### Get data
# In[7]:
# Open the telemetry-cohorts dataset (per-experiment ping storage).
cohorts = Dataset.from_source("telemetry-cohorts")
cohorts.schema
# Fetch experiment data
# In[8]:
# Pull a 10% sample of this experiment's pings inside the date window,
# then cache the RDD: it is reused by count() here and flatMap later.
pings = cohorts.where(submissionDate = lambda x: x >= START_DATE and x <= END_DATE) .where(experimentId=EXPERIMENT_SLUG) .records(sc, sample=.1)
pings.cache()
pings.count()
# What were the different ping types observed for this experiment on this day?
# In[9]:
# pings.map(lambda f: f["meta"].get("docType", "None")).countByValue()
# In[10]:
# Keep only "main" pings; other docTypes observed for the experiment are ignored.
main_pings = pings.filter(lambda x: x["meta"]["docType"]=="main")
# In[11]:
# Cached because main_pings is iterated twice below (flatMap + branch count).
main_pings.cache()
# #### (testing samples)
# In[12]:
# subset = main_pings.take(1000)
# In[13]:
# subset = sc.parallelize(subset)
# In[14]:
# single = sc.parallelize(main_pings.take(1))
# In[15]:
# single.first()
# ### Utility functions
# In[16]:
def extract_probes(p):
    """Flatten one ping into per-sample probe records.

    Returns a list of {"probe", "branch", "val"} dicts, expanding each
    histogram entry {value: count} into `count` records so the
    downstream DataFrame holds raw samples. `branch` falls back to
    "warning" when the ping lacks this experiment's annotation.

    Changes from the original: `.iteritems()` -> `.items()` (identical
    behavior, works on Python 2 and 3), and the outer loop variable was
    renamed so it no longer shadows the module-level `probe_names` list.
    """
    branch = p.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    output = []
    for path, names in experiment_probes.items():
        for probe_name in names:
            probe = recursive_get(p, path.split("/")).get(probe_name, {})
            # Take all entries in the histogram right now. This treats
            # bucket boundaries as raw values, which is approximate --
            # inspect more before trusting fine-grained statistics.
            for k, v in probe.get("values", {}).items():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
# In[17]:
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    """Compute shared histogram bin edges for a probe across all branches.

    Collects every sample for `probe_name` from the module-level Spark
    DataFrame `df`, drops the top and bottom 0.5% as a cheap outlier
    trim, and returns 10 bin edges -- log10-spaced when `logscale`,
    otherwise numpy's linear bins.

    Fix: when the trim count rounds to zero (< 200 samples), the
    original slice `[trim:-1*trim]` was `[0:-0]`, which is EMPTY and
    silently discarded every sample; now the untrimmed values are kept.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # remove top 0.5%, bottom 0.5% for easy outlier trimming
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        # always shift so the minimum is >= 1 before taking log10
        if all_branches_trimmed[0] < 1:
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        # NOTE(review): the lower edge starts at index 1, not 0 -- looks
        # like an off-by-one; confirm whether index 0 was intended.
        return list(np.linspace(np.log10(all_branches_trimmed[1]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# In[27]:
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    """Decide whether a probe has a chartable number of samples.

    Returns (True, None) when chartable; otherwise (False, reason) when
    the probe has no samples at all or more than ten million of them.
    """
    count = df.where(df.probe == pref_name).count()
    if count == 0:
        return (False, "0 entries for pref %s" % pref_name)
    if count > 10000000:
        return (False, "%i values for pref %s" % (count, pref_name))
    return (True, None)
# In[28]:
# get values for branch of experiment for pref, and trim off outliers
def get_vals(pref_name, branch):
    """Collect sorted sample values for one probe/branch, outliers trimmed.

    Drops the top and bottom 0.5% of samples after sorting. Fix: when
    the trim count rounds to zero (< 200 samples), the original slice
    `[trim:-1*trim]` was `[0:-0]` -- an empty list -- silently throwing
    away all data; now the full sorted list is returned in that case.
    """
    x_vals = [r.val for r in df.where(df.probe == pref_name).where(df.branch == branch).collect()]
    trim = int(len(x_vals) / 200.0)
    x_sorted = sorted(x_vals)
    return x_sorted[trim:-trim] if trim > 0 else x_sorted
# In[29]:
# return (pval, direction) if significant p value for mannwhitneyu vs control
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Mann-Whitney U test of one branch's samples against control's.

    Returns (pvalue_string, direction): the p-value string is prefixed
    with "***" when below `p_threshold`, and direction reports whether
    the branch median is above or below control's (inputs are assumed
    sorted, as produced by get_vals, since `median` indexes directly).
    Returns (None, None) when the test cannot be computed.

    Fix: the original bare `except:` swallowed every exception,
    including real bugs; narrowed to ValueError, which is what
    scipy.stats.mannwhitneyu raises on degenerate input.
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# In[43]:
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean
def chart_pref(pref_name, logscale):
    # Draws one stacked subplot per branch (shared x axis) using bins
    # shared across branches, then prints branch name + p-value whenever
    # the Mann-Whitney test vs. control is significant. Relies on
    # module-level `branches`, `n_branches`, `df`, and on "gecko" being
    # the control branch (set in the Results section).
    sig_branches = []
    fig, axarr = plt.subplots(n_branches, 1, sharex=True)
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    control_vals = get_vals(pref_name, "gecko")
    for i in range(n_branches):
        if branches[i] == "control":
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale: #always assume 0 as lowest val for now
            # Shift by 1 to keep log defined at 0.
            x_trans = py_map(lambda d: d+1, x)
            # NOTE(review): bins from get_bins are log10-spaced but the
            # data is transformed with np.log (natural log) -- the bases
            # disagree; confirm which was intended.
            ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        # "gecko" is control; no point testing it against itself.
        if branches[i] == "gecko": continue
        p, direction = test_unequal(x, control_vals)
        # test_unequal prefixes significant p-values with "***".
        if p is not None and p.startswith("***"):
            print branches[i], p #, direction
    plt.show()
# # Results
# In[44]:
# Expand every main ping into per-sample probe records (one dict per count).
probe_dicts = main_pings.flatMap(extract_probes)
# In[45]:
# One DataFrame row per sample: columns probe / branch / val.
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
# In[46]:
#check on branch distribution
for b,v in main_pings.map(lambda x: x.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v
# In[47]:
branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
#for this one - set order
# NOTE(review): hard-coding assumes exactly these two branches exist;
# n_branches above was computed from the data -- confirm they agree.
branches = ["gecko", "stylo"]
# In[48]:
# Chart every probe twice (linear and log scale), or print why it
# cannot be charted; blank line between probes.
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
    print
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment