Skip to content

Instantly share code, notes, and snippets.

@ilanasegall ilanasegall/stylo_metrics.ipynb Secret
Last active Aug 23, 2017

Embed
What would you like to do?
stylo_metrics
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# This script is intended for variables not yet available in the [Experiments Viewer](https://moz-experiments-viewer.herokuapp.com/).
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from collections import defaultdict as dd
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import itertools
from moztelemetry.dataset import Dataset
from scipy.stats import mannwhitneyu
# Keep handles to the real Python builtins: the `from pyspark.sql.functions
# import *` above shadows max/map with Spark column functions.
py_max = __builtin__.max
py_map = __builtin__.map
get_ipython().magic(u'pylab inline')  # notebook-only: inline matplotlib figures
# ### Define Experiment Variables
# In[2]:
# Histogram probes to compare across experiment branches, keyed by the
# ping path under which each histogram lives.
experiment_probes = {
    "payload/histograms": [
        "TIME_TO_DOM_INTERACTIVE_MS",
        "TIME_TO_DOM_COMPLETE_MS",
        "MEMORY_TOTAL",
        "MEMORY_UNIQUE",
        "MEMORY_VSIZE",
        "GHOST_WINDOWS",
    ],
    "payload/processes/content/histograms": [
        "TIME_TO_DOM_LOADING_MS",
    ],
}
# In[3]:
probe_names = list(itertools.chain(*experiment_probes.values()))
# To investigate:
#
# TOTAL_SCROLL_Y always = 0.0
# no entries for IDLE_NOTIFY_BACK_LISTENERS
#
# In[4]:
# Experiment identifier and its inclusive submission-date window.
EXPERIMENT_SLUG = "pref-flip-quantum-css-style-r1-1381147"
START_DATE = "20170725" #YYYYMMDD
END_DATE = "20170823" #YYYYMMDD
# ### Simple Utils
# In[5]:
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
#modified from https://stackoverflow.com/questions/13852896/using-a-single-variable-to-index-into-nested-dictionaries
def recursive_get(d, keys):
    """Walk nested dicts `d` along `keys`, returning {} for any missing key.

    Equivalent to d[keys[0]][keys[1]]...[keys[-1]] with {} substituted
    wherever a key is absent. `keys` must be non-empty.
    """
    node = d
    for key in keys[:-1]:
        node = node.get(key, {})
    return node.get(keys[-1], {})
# In[6]:
def median(lst):
    """Return the middle element of `lst` (the upper-middle for even length).

    Note: this does NOT sort its input and does not average the two middle
    elements — callers in this notebook pass already-sorted lists and only
    compare medians, so the upper-middle element is sufficient.
    Raises IndexError on an empty list.
    """
    # // (floor division): plain / is float division on Python 3, which
    # would raise "list indices must be integers" here.
    return lst[len(lst) // 2]
# ### Get data
# In[7]:
# Lazily enumerate the "telemetry-cohorts" dataset (per-experiment pings).
cohorts = Dataset.from_source("telemetry-cohorts")
cohorts.schema
# Fetch experiment data
# In[8]:
# Restrict to the experiment's date window and slug, then pull a 10%
# sample of records into an RDD (`sc` is the notebook's SparkContext).
pings = cohorts.where(submissionDate = lambda x: x >= START_DATE and x <= END_DATE) .where(experimentId=EXPERIMENT_SLUG) .records(sc, sample=.1)
pings.cache()
pings.count()  # forces evaluation so the cache is populated
# What were the different ping types observed for this experiment on this day?
# In[9]:
# pings.map(lambda f: f["meta"].get("docType", "None")).countByValue()
# In[10]:
# Keep only "main" pings; other docTypes (crash, etc.) are dropped.
main_pings = pings.filter(lambda x: x["meta"]["docType"]=="main")
# In[11]:
main_pings.cache()
# #### (testing samples)
# In[12]:
# subset = main_pings.take(1000)
# In[13]:
# subset = sc.parallelize(subset)
# In[14]:
# single = sc.parallelize(main_pings.take(1))
# In[15]:
# single.first()
# ### Utility functions
# In[16]:
def extract_probes(p, probes=None, slug=None):
    """Flatten one ping's histograms of interest into per-sample rows.

    Args:
        p: a decoded telemetry ping (nested dicts).
        probes: mapping of ping path -> list of histogram names; defaults
            to the module-level `experiment_probes`.
        slug: experiment id used to look up the branch; defaults to the
            module-level EXPERIMENT_SLUG.

    Returns a list of {"probe", "branch", "val"} dicts, one per histogram
    sample (each histogram bucket value is repeated `count` times).
    Pings without a branch tag for the experiment get branch "warning".
    """
    if probes is None:
        probes = experiment_probes
    if slug is None:
        slug = EXPERIMENT_SLUG
    branch = p.get("environment", {}).get("experiments", {}).get(slug, {}).get("branch", "warning")
    output = []
    for path, names in probes.items():
        for probe_name in names:
            # walk the slash-separated path into the nested ping dict,
            # substituting {} for any missing level
            node = p
            for key in path.split("/"):
                node = node.get(key, {})
            probe = node.get(probe_name, {})
            # take all entries in histogram right now. this is probably
            # problematic (includes every bucket). inspect more
            for k, v in probe.get("values", {}).items():
                output.extend([{"probe": probe_name, "branch": branch, "val": float(k)}] * int(v))
    return output
# In[17]:
# get the bins we use for the histogram for a probe by looking at all branches
# get the bins we use for the histogram for a probe by looking at all branches
def get_bins(probe_name, logscale=False):
    """Compute shared histogram bin edges for `probe_name` across branches.

    Pools all `val` samples for the probe from the module-level DataFrame
    `df`, trims the top and bottom 0.5% as a cheap outlier filter, and
    returns ~10 bin edges (log10-spaced when `logscale` is True).

    NOTE(review): chart_pref() plots np.log (natural log) of the data
    against these log10-spaced edges — confirm the bases are meant to match.
    """
    all_branches = [r.val for r in df.where(df.probe == probe_name).collect()]
    # remove top 0.5%, bottom 0.5% for easy outlier trimming
    trim = int(len(all_branches) / 200.0)
    all_branches_trimmed = sorted(all_branches)
    if trim > 0:
        # guard: with fewer than 200 samples trim == 0 and the original
        # slice [0:-0] would silently produce an empty list
        all_branches_trimmed = all_branches_trimmed[trim:-trim]
    if logscale:
        if all_branches_trimmed[0] < 1:
            # shift so log10 is defined (assumes values >= 0)
            all_branches_trimmed = [d + 1 for d in all_branches_trimmed]
        # was all_branches_trimmed[1]: off-by-one that skipped the minimum,
        # leaving the smallest samples outside the first bin
        return list(np.linspace(np.log10(all_branches_trimmed[0]),
                                np.log10(all_branches_trimmed[-1]), 10))
    n, b = np.histogram(all_branches_trimmed, 10)
    return b
# In[27]:
# return (proceed bool, reason)
# return (proceed bool, reason)
def can_chart_pref(pref_name):
    """Decide whether a probe has a chartable number of samples.

    Counts rows for `pref_name` in the module-level DataFrame `df` and
    returns (True, None) when chartable, otherwise (False, reason).
    """
    count = df.where(df.probe == pref_name).count()
    if count == 0:
        return (False, "0 entries for pref %s"%pref_name)
    if count > 10000000:
        return (False, "%i values for pref %s"%(count,pref_name))
    return (True, None)
# In[28]:
# get values for branch of experiment for pref, and trim off outliers
def get_vals(pref_name, branch):
    """Collect one branch's samples for a probe, sorted and outlier-trimmed.

    Pulls `val` from the module-level DataFrame `df` for rows matching
    `pref_name` and `branch`, sorts them, and drops the top and bottom
    0.5% as a cheap outlier filter. Returns the sorted, trimmed list.
    """
    x_vals = [r.val for r in df.where(df.probe == pref_name)
                               .where(df.branch == branch)
                               .collect()]
    trim = int(len(x_vals) / 200.0)
    x_sorted = sorted(x_vals)
    # guard: with fewer than 200 samples trim == 0 and the original slice
    # [0:-0] would silently return an empty list
    return x_sorted[trim:-trim] if trim > 0 else x_sorted
# In[29]:
# return (pval, direction) if significant p value for mannwhitneyu vs control
def test_unequal(branch_vals, control_vals, p_threshold=.05):
    """Mann-Whitney U test of a branch's samples against control.

    Returns (pvalue_str, direction): pvalue_str is str(pvalue) prefixed
    with "***" when below `p_threshold`, and direction reports whether the
    branch median is above or below control. Returns (None, None) when the
    test cannot be computed.
    """
    try:
        r = mannwhitneyu(branch_vals, control_vals)
    except ValueError:
        # was a bare except; mannwhitneyu raises ValueError (e.g. when all
        # values are identical or a sample is empty) — catch only that so
        # real programming errors still surface
        return None, None
    prefix = ""
    if r.pvalue < p_threshold:
        prefix = "***"
    if median(branch_vals) > median(control_vals):
        return (prefix + str(r.pvalue), "> control")
    return (prefix + str(r.pvalue), "< control")
# In[43]:
# chart histograms for all branches of a probe, log/std, and calculate if any branches vary from the mean
def chart_pref(pref_name, logscale):
    """Plot per-branch histograms of `pref_name` on shared bins and print
    any branch whose distribution differs significantly from control.

    Uses module-level globals: df (via get_bins/get_vals), branches,
    n_branches, plt, py_map. "gecko" is treated as the control branch.
    """
    sig_branches = []  # NOTE(review): never appended to below — appears unused
    fig, axarr = plt.subplots(n_branches, 1, sharex=True)
    b = get_bins(pref_name, logscale)
    plt.tight_layout()
    print pref_name + " (logscale=" + str(logscale) + ")" + ":"
    control_vals = get_vals(pref_name, "gecko")
    for i in range(n_branches):
        if branches[i] == "control":
            # NOTE(review): branches is set to ["gecko", "stylo"] below, so
            # this arm never fires and gecko's values are fetched twice
            x = control_vals
        else:
            x = get_vals(pref_name, branches[i])
        if logscale: #always assume 0 as lowest val for now
            # shift by 1 so the log of a zero sample is defined
            x_trans = py_map(lambda d: d+1, x)
            # NOTE(review): np.log is natural log while get_bins spaces its
            # edges in log10 — confirm the bases are meant to match
            ap,bp,cp = axarr[i].hist(np.log(x_trans), bins=b)
        else:
            axarr[i].hist(x, bins=b)
        axarr[i].set_title(branches[i])
        if branches[i] == "gecko": continue  # don't test control against itself
        p, direction = test_unequal(x, control_vals)
        if p is not None and p.startswith("***"):
            # only significant branches are printed
            print branches[i], p #, direction
    plt.show()
# # Results
# In[44]:
# Explode every main ping into one row per histogram sample.
probe_dicts = main_pings.flatMap(extract_probes)
# In[45]:
# One DataFrame row per sample: (probe, branch, val).
df = sqlContext.createDataFrame(probe_dicts.map(lambda d: Row(**d)))
# In[46]:
#check on branch distribution
# ("warning" counts pings that carried no branch tag for this experiment)
for b,v in main_pings.map(lambda x: x.get("environment",{}).get("experiments",{}).get(EXPERIMENT_SLUG, {}).get("branch", "warning")).countByValue().iteritems():
    print b, ":", v
# In[47]:
branches = [i.branch for i in df.select("branch").distinct().collect()]
n_branches = len(branches)
#for this one - set order
# NOTE(review): this overwrites the distinct() result above; assumes the
# only branches present are "gecko" and "stylo"
branches = ["gecko", "stylo"]
# In[48]:
# Chart every probe at linear and log scale, or print why it can't be charted.
for p in probe_names:
    can_chart, reason = can_chart_pref(p)
    if can_chart:
        chart_pref(p, False)
        chart_pref(p, True)
    else:
        print reason
        print
# In[ ]:
# In[ ]:
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.