-
-
Save ilanasegall/a422dcdbaec8b0c44b984567a9a04a42 to your computer and use it in GitHub Desktop.
stylo_metrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/). | |
# In[1]: | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
from scipy.stats import chi2_contingency | |
from scipy.stats import ttest_ind | |
from collections import defaultdict as dd | |
from pyspark.sql import Row | |
from pyspark.sql import SQLContext | |
from pyspark.sql.types import * | |
from pyspark.sql.functions import * | |
import itertools | |
from moztelemetry.dataset import Dataset | |
from scipy.stats import mannwhitneyu | |
# Keep handles to the real builtins: "from pyspark.sql.functions import *"
# above shadows max/map with Spark column functions of the same name.
py_max = __builtin__.max
py_map = __builtin__.map
# Enable inline matplotlib rendering in the notebook.
get_ipython().magic(u'pylab inline')
# ### Define Experiment Variables | |
# In[2]: | |
# Histogram probes to analyze, keyed by their location (path) inside the ping.
experiment_probes = {
    "payload/histograms": [
        "TIME_TO_DOM_INTERACTIVE_MS",
        "TIME_TO_DOM_COMPLETE_MS",
        "MEMORY_TOTAL",
        "MEMORY_UNIQUE",
        "MEMORY_VSIZE",
        "GHOST_WINDOWS",
    ],
    "payload/processes/content/histograms": [
        "TIME_TO_DOM_LOADING_MS",
    ],
}

# Flat list of every probe name across all payload locations.
probe_names = list(itertools.chain.from_iterable(experiment_probes.values()))
# To investigate: | |
# | |
# TOTAL_SCROLL_Y always = 0.0 | |
# no entries for IDLE_NOTIFY_BACK_LISTENERS | |
# | |
# In[4]: | |
# Experiment identifier and submission-date window to pull.
# Dates are YYYYMMDD strings, so >=/<= comparisons order correctly.
EXPERIMENT_SLUG = "pref-flip-quantum-css-style-r1-1381147"
START_DATE = "20170725"  # YYYYMMDD
END_DATE = "20170823"  # YYYYMMDD
# ### Simple Utils | |
# In[5]: | |
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries | |
# adapted from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    """Walk nested dicts along *keys*; any missing level yields {}."""
    node = d
    for key in keys[:-1]:
        node = node.get(key, {})
    return node.get(keys[-1], {})
# In[6]: | |
def median(lst):
    """Return the middle element of *lst*.

    Assumes *lst* is already sorted (callers pass sorted, trimmed values)
    and non-empty. For even-length input this picks the upper-middle
    element rather than averaging the two middle values.
    """
    # Floor division keeps the index an int on both Python 2 and Python 3;
    # the original "/" produces a float index (TypeError) under Python 3.
    return lst[len(lst) // 2]
# ### Get data | |
# In[7]: | |
# Open the "telemetry-cohorts" experiment dataset and show its schema.
cohorts = Dataset.from_source("telemetry-cohorts")
cohorts.schema

# Fetch a 10% sample of this experiment's pings within the date window.
# The date bounds compare as "YYYYMMDD" strings, which sort correctly.
pings = cohorts.where(submissionDate = lambda x: x >= START_DATE and x <= END_DATE) .where(experimentId=EXPERIMENT_SLUG) .records(sc, sample=.1)
# Cache and materialize so downstream transformations reuse the fetched RDD.
pings.cache()
pings.count()
# What were the different ping types observed for this experiment on this day? | |
# In[9]: | |
# pings.map(lambda f: f["meta"].get("docType", "None")).countByValue() | |
# In[10]: | |
# Keep only "main" pings; other docTypes for the experiment are discarded.
main_pings = pings.filter(lambda x: x["meta"]["docType"]=="main")
# Cache: main_pings is traversed several times below.
main_pings.cache()
# #### (testing samples) | |
# In[12]: | |
# subset = main_pings.take(1000) | |
# In[13]: | |
# subset = sc.parallelize(subset) | |
# In[14]: | |
# single = sc.parallelize(main_pings.take(1)) | |
# In[15]: | |
# single.first() | |
# ### Utility functions | |
# In[16]: | |
def extract_probes(p):
    """Flatten one main ping into per-sample probe records.

    Returns a list of dicts {"probe", "branch", "val"}, one entry per
    recorded histogram sample (each histogram bucket value k is repeated
    v times), for every probe listed in the module-level
    `experiment_probes`. Pings missing the experiment annotation get the
    branch label "warning".
    """
    branch = p.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")
    output = []
    # Loop variable renamed from `probe_names` so it no longer shadows the
    # module-level `probe_names` list.
    for path, names in experiment_probes.iteritems():
        for probe_name in names:
            probe = recursive_get(p, path.split("/")).get(probe_name, {})
            # Take all entries in the histogram right now: bucket value k is
            # the sample magnitude, v its count. This is probably
            # problematic (original author's caveat) -- inspect more.
            for k, v in probe.get("values",{}).iteritems():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
# In[17]: | |
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    """Compute 10 histogram bin edges for *probe_name* across all branches.

    Reads the module-level DataFrame `df`. With logscale=True the edges
    are linearly spaced in log10 of the outlier-trimmed values.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # Drop the top 0.5% and bottom 0.5% as a cheap outlier filter.
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    # Bug fix: when trim == 0 the original slice [trim:-1*trim] == [0:0]
    # returned an empty list, discarding every sample for small result sets.
    if trim > 0:
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        if all_branches_trimmed[0] < 1:
            # Shift by +1 so log10 stays finite for values below 1.
            all_branches_trimmed = py_map(lambda d: d + 1, all_branches_trimmed)
        # Bug fix: start the range at the minimum trimmed value ([0]); the
        # original indexed [1], skipping the minimum while the upper bound
        # used [-1] (the maximum).
        # NOTE(review): chart_pref() histograms np.log (natural log) of the
        # data against these log10-spaced edges -- the bases disagree;
        # confirm which is intended.
        return list(np.linspace(np.log10(all_branches_trimmed[0]), np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# In[27]: | |
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    """Decide whether *pref_name* has a chartable number of samples.

    Returns (True, None) when the probe has between 1 and 10M samples in
    the module-level DataFrame `df`, else (False, reason_string).
    """
    sample_count = df.where(df.probe == pref_name).count()
    if sample_count == 0:
        return (False, "0 entries for pref %s" % pref_name)
    if sample_count > 10000000:
        return (False, "%i values for pref %s" % (sample_count, pref_name))
    return (True, None)
# In[28]: | |
# get values for branch of experiment for pref, and trim off outliers
def get_vals(pref_name, branch):
    """Return sorted sample values for (probe, branch), outliers removed.

    Reads the module-level DataFrame `df`; drops the top 0.5% and bottom
    0.5% of values.
    """
    x_vals = [r.val for r in df.where(df.probe == pref_name).where(df.branch == branch).collect()]
    trim = int(len(x_vals) / 200.0)
    x_sorted = sorted(x_vals)
    # Bug fix: with trim == 0 the original slice [trim:-1*trim] == [0:0]
    # returned an empty list, silently dropping every sample for small
    # result sets.
    return x_sorted[trim:-trim] if trim > 0 else x_sorted
# In[29]: | |
# return (pval, direction) if significant p value for mannwhitneyu vs control
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Compare a branch against control with a Mann-Whitney U test.

    Returns (pvalue_string, direction), where the p-value string is
    prefixed with "***" when below *p_threshold* and direction indicates
    whether the branch median is above or below control. Returns
    (None, None) when the test cannot be computed.
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # Narrowed from a bare except: mannwhitneyu raises ValueError when
        # the statistic is undefined (e.g. every value identical).
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# In[43]: | |
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean
def chart_pref(pref_name, logscale):
    """Plot per-branch histograms for *pref_name*, printing significant diffs.

    Uses module-level `df`, `branches`, `n_branches`; each non-"gecko"
    branch is compared against the "gecko" branch via test_unequal().
    """
    sig_branches = []
    # One stacked subplot per branch, sharing the x axis.
    fig, axarr = plt.subplots(n_branches, 1, sharex=True)
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    # "gecko" acts as the control branch for this experiment.
    control_vals = get_vals(pref_name, "gecko")
    for i in range(n_branches):
        # NOTE(review): `branches` is set to ["gecko", "stylo"] below, so
        # the "control" label never matches and control_vals is re-fetched
        # by the else arm for "gecko" -- confirm intent.
        if branches[i] == "control":
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale: #always assume 0 as lowest val for now
            # Shift by +1 before taking logs so zero values stay finite.
            # NOTE(review): np.log (natural log) here vs log10-spaced bin
            # edges from get_bins() -- bases disagree; confirm.
            x_trans = py_map(lambda d: d+1, x)
            ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        # Don't significance-test the control branch against itself.
        if branches[i] == "gecko": continue
        p, direction = test_unequal(x, control_vals)
        # test_unequal prefixes the p-value string with "***" when
        # significant at the default threshold.
        if p is not None and p.startswith("***"):
            print branches[i], p #, direction
    plt.show()
# # Results | |
# In[44]: | |
# Flatten every main ping into one record per histogram sample.
probe_dicts = main_pings.flatMap(extract_probes)

# Build the module-level DataFrame `df` with columns (probe, branch, val);
# the charting helpers above all read it.
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))

# check on branch distribution
for b,v in main_pings.map(lambda x: x.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v

branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
#for this one - set order
# NOTE(review): the discovered branch list is immediately replaced with a
# fixed ordering; n_branches still reflects the discovered list -- confirm
# the two agree.
branches = ["gecko", "stylo"]

# Chart each probe twice (linear and log scale) when it has data;
# otherwise print why it was skipped.
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
# In[ ]: | |
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment