chutten/beta47_plugin_block.ipynb Secret

## beta47_plugin_block.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              beta47_plugin_block.ipynb
            
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## beta47_plugin_block.py

# coding: utf-8

# ### Plugin Block Experiment

# This is a very a brief introduction to Spark and Telemetry in Python. You should have a look at the [tutorial](https://gist.github.com/vitillo/25a20b7c8685c0c82422) in Scala and the associated [talk](http://www.slideshare.net/RobertoAgostinoVitil/spark-meets-telemetry) if you are interested to learn more about Spark.

# In[38]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython

from __future__ import division
from montecarlino import grouped_permutation_test

from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties

get_ipython().magic(u'pylab inline')
IPython.core.pylabtools.figsize(16, 7)


# In[39]:

def chi2_distance(xs, ys, eps = 1e-10, normalize = True):
    histA = xs.sum(axis=0)
    histB = ys.sum(axis=0)

    if normalize:
        histA = histA/histA.sum()
        histB = histB/histB.sum()

    d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
        for (a, b) in zip(histA, histB)])

    return d

def median_diff(xs, ys):
    return np.median(xs) - np.median(ys)

def compare_histogram(histogram, e10s, none10s, branch_one, branch_two):
    # Normalize individual histograms
    e10s = e10s.map(lambda x: x/x.sum())
    none10s = none10s.map(lambda x: x/x.sum())

    pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

    eTotal = e10s.sum()
    nTotal = none10s.sum()

    eTotal = 100*eTotal/eTotal.sum()
    nTotal = 100*nTotal/nTotal.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)

    ax = fig.add_subplot(1, 1, 1)
    ax2 = ax.twinx()
    width = 0.4
    ylim = max(eTotal.max(), nTotal.max())

    eTotal.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax, width=width, position=0, ylim=(0, ylim + 1))
    nTotal.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2, width=width, position=1, grid=False, ylim=ax.get_ylim())

    ax.legend(ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0],
              ["{} ({} samples)".format(branch_one, len(e10s)), "{} ({} samples)".format(branch_two, len(none10s))],
             loc="best")

    # If there are more than 100 labels, hide every other one so we can still read them
    if len(ax.get_xticklabels()) > 100:
        for label in ax.get_xticklabels()[::2]:
            label.set_visible(False)

    plt.title(histogram)
    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    print "The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue)

def normalize_uptime_hour(frame):
    frame = frame[frame["payload/simpleMeasurements/uptime"] > 0]
    frame = 60 * frame.apply(lambda x: x/frame["payload/simpleMeasurements/uptime"]) # Metric per hour
    frame.drop('payload/simpleMeasurements/uptime', axis=1, inplace=True)
    return frame

def compare_count_histograms(pings, *histograms_names):
    properties = histograms_names + ("payload/simpleMeasurements/uptime", "e10s")

    frame = pd.DataFrame(get_pings_properties(pings, properties).collect())

    e10s = frame[frame["e10s"] == True]
    e10s = normalize_uptime_hour(e10s)

    none10s = frame[frame["e10s"] == False]
    none10s = normalize_uptime_hour(none10s)

    for histogram in e10s.columns:
        if histogram == "e10s" or histogram.endswith("_parent") or histogram.endswith("_children"):
            continue

        compare_scalars(histogram + " per hour", e10s[histogram].dropna(), none10s[histogram].dropna())


def compare_histograms(pings, branch_one, branch_two, *histogram_names):
    frame = pd.DataFrame(get_pings_properties(pings, histogram_names + ("environment/addons/activeExperiment/branch",)).collect())
    e10s = frame[frame["environment/addons/activeExperiment/branch"] == branch_one]
    none10s = frame[frame["environment/addons/activeExperiment/branch"] == branch_two]

    for histogram in none10s.columns:
        if histogram == "environment/addons/activeExperiment/branch":
            continue

        has_one = np.sum(e10s[histogram].notnull()) > 0
        has_two = np.sum(none10s[histogram].notnull()) > 0

        if has_one and has_two:
            compare_histogram(histogram, e10s[histogram].dropna(), none10s[histogram].dropna(), branch_one, branch_two)

def compare_scalars(metric, *groups):
    print "Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(metric,
                                                                        median_diff(*groups),
                                                                        np.median(groups[0]),
                                                                        np.median(groups[1]))
    print "The probability of this effect being purely by chance is {:.2f}.".         format(grouped_permutation_test(median_diff, groups, num_samples=10000))


# In[40]:

sc.defaultParallelism


# In[64]:

pings = get_pings(sc, app="Firefox", channel="beta", version="47.0", build_id=("20160510000000", "20160517999999"), fraction=0.5)


# In[68]:

def experiment(p):
    return p.get("environment", {}).get("addons", {}).get("activeExperiment", {})


# In[69]:

participants = pings.filter(lambda p: experiment(p).get("id", None) == "plugin-block-beta47@experiments.mozilla.org" and experiment(p).get("branch", None) is not None)


# In[70]:

participants.map(lambda p: (experiment(p).get("branch", None), p)).countByKey()


# In[71]:

subset = get_one_ping_per_client(participants)


# In[72]:

compare_histograms(subset,
                   "aggressive",
                   "control",
                   "payload/keyedHistograms/BLOCKED_ON_PLUGIN_INSTANCE_DESTROY_MS/Shockwave Flash21.0.0.242",
                   "payload/keyedHistograms/BLOCKED_ON_PLUGIN_INSTANCE_INIT_MS/Shockwave Flash21.0.0.242",
                   "payload/keyedHistograms/BLOCKED_ON_PLUGIN_MODULE_INIT_MS/Shockwave Flash21.0.0.242",
                   "payload/keyedHistograms/BLOCKED_ON_PLUGIN_STREAM_INIT_MS/Shockwave Flash21.0.0.242"
                  )


# In[73]:

compare_histograms(subset,
                   "aggressive",
                   "control",
                   "payload/histograms/FLASH_PLUGIN_AREA",
                   "payload/histograms/FLASH_PLUGIN_HEIGHT",
                   "payload/histograms/FLASH_PLUGIN_WIDTH"
                  )


# In[74]:

compare_histograms(subset,
                   "aggressive",
                   "control",
                   # "payload/histograms/PLUGIN_BLOCKED_FOR_STABILITY", # is 0 for control.
                   "payload/histograms/INPUT_EVENT_RESPONSE_MS",
                   "payload/histograms/FLASH_PLUGIN_INSTANCES_ON_PAGE",
                   "payload/histograms/FX_PAGE_LOAD_MS"
                  )


# In[75]:

compare_histograms(subset,
                   "aggressive",
                   "control",
                   #"payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/plugin",
                   #"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin"
                  )


# In[76]:

compare_histograms(subset,
                   "aggressive",
                   "control",
                   #"payload/histograms/PLUGIN_HANG_NOTICE_COUNT",
                   "payload/histograms/PLUGIN_HANG_TIME",
                   "payload/histograms/PLUGIN_HANG_UI_RESPONSE_TIME",
                   "payload/histograms/PLUGIN_HANG_UI_USER_RESPONSE"
                  )


# In[79]:

compare_histograms(subset,
                  "aggressive",
                  "control",
                  #"payload/keyedHistograms/PLUGIN_ACTIVATION_COUNT/flash",
                  #"payload/keyedHistograms/PLUGIN_ACTIVATION_COUNT/java"
                  )


# In[78]:

compare_histograms(subset,
                  "aggressive",
                  "control",
                  "payload/histograms/HTTP_REQUEST_PER_PAGE",
                  "payload/histograms/PLUGIN_TINY_CONTENT"
                  )

	# coding: utf-8

	# ### Plugin Block Experiment

	# This is a very a brief introduction to Spark and Telemetry in Python. You should have a look at the [tutorial](https://gist.github.com/vitillo/25a20b7c8685c0c82422) in Scala and the associated [talk](http://www.slideshare.net/RobertoAgostinoVitil/spark-meets-telemetry) if you are interested to learn more about Spark.

	# In[38]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	import IPython

	from __future__ import division
	from montecarlino import grouped_permutation_test

	from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties

	get_ipython().magic(u'pylab inline')
	IPython.core.pylabtools.figsize(16, 7)


	# In[39]:

	def chi2_distance(xs, ys, eps = 1e-10, normalize = True):
	histA = xs.sum(axis=0)
	histB = ys.sum(axis=0)

	if normalize:
	histA = histA/histA.sum()
	histB = histB/histB.sum()

	d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
	for (a, b) in zip(histA, histB)])

	return d

	def median_diff(xs, ys):
	return np.median(xs) - np.median(ys)

	def compare_histogram(histogram, e10s, none10s, branch_one, branch_two):
	# Normalize individual histograms
	e10s = e10s.map(lambda x: x/x.sum())
	none10s = none10s.map(lambda x: x/x.sum())

	pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

	eTotal = e10s.sum()
	nTotal = none10s.sum()

	eTotal = 100*eTotal/eTotal.sum()
	nTotal = 100*nTotal/nTotal.sum()

	fig = plt.figure()
	fig.subplots_adjust(hspace=0.3)

	ax = fig.add_subplot(1, 1, 1)
	ax2 = ax.twinx()
	width = 0.4
	ylim = max(eTotal.max(), nTotal.max())

	eTotal.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax, width=width, position=0, ylim=(0, ylim + 1))
	nTotal.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2, width=width, position=1, grid=False, ylim=ax.get_ylim())

	ax.legend(ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0],
	["{} ({} samples)".format(branch_one, len(e10s)), "{} ({} samples)".format(branch_two, len(none10s))],
	loc="best")

	# If there are more than 100 labels, hide every other one so we can still read them
	if len(ax.get_xticklabels()) > 100:
	for label in ax.get_xticklabels()[::2]:
	label.set_visible(False)

	plt.title(histogram)
	plt.xlabel(histogram)
	plt.ylabel("Frequency %")
	plt.show()

	print "The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue)

	def normalize_uptime_hour(frame):
	frame = frame[frame["payload/simpleMeasurements/uptime"] > 0]
	frame = 60 * frame.apply(lambda x: x/frame["payload/simpleMeasurements/uptime"]) # Metric per hour
	frame.drop('payload/simpleMeasurements/uptime', axis=1, inplace=True)
	return frame

	def compare_count_histograms(pings, *histograms_names):
	properties = histograms_names + ("payload/simpleMeasurements/uptime", "e10s")

	frame = pd.DataFrame(get_pings_properties(pings, properties).collect())

	e10s = frame[frame["e10s"] == True]
	e10s = normalize_uptime_hour(e10s)

	none10s = frame[frame["e10s"] == False]
	none10s = normalize_uptime_hour(none10s)

	for histogram in e10s.columns:
	if histogram == "e10s" or histogram.endswith("_parent") or histogram.endswith("_children"):
	continue

	compare_scalars(histogram + " per hour", e10s[histogram].dropna(), none10s[histogram].dropna())


	def compare_histograms(pings, branch_one, branch_two, *histogram_names):
	frame = pd.DataFrame(get_pings_properties(pings, histogram_names + ("environment/addons/activeExperiment/branch",)).collect())
	e10s = frame[frame["environment/addons/activeExperiment/branch"] == branch_one]
	none10s = frame[frame["environment/addons/activeExperiment/branch"] == branch_two]

	for histogram in none10s.columns:
	if histogram == "environment/addons/activeExperiment/branch":
	continue

	has_one = np.sum(e10s[histogram].notnull()) > 0
	has_two = np.sum(none10s[histogram].notnull()) > 0

	if has_one and has_two:
	compare_histogram(histogram, e10s[histogram].dropna(), none10s[histogram].dropna(), branch_one, branch_two)

	def compare_scalars(metric, *groups):
	print "Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(metric,
	median_diff(*groups),
	np.median(groups[0]),
	np.median(groups[1]))
	print "The probability of this effect being purely by chance is {:.2f}.". format(grouped_permutation_test(median_diff, groups, num_samples=10000))


	# In[40]:

	sc.defaultParallelism


	# In[64]:

	pings = get_pings(sc, app="Firefox", channel="beta", version="47.0", build_id=("20160510000000", "20160517999999"), fraction=0.5)


	# In[68]:

	def experiment(p):
	return p.get("environment", {}).get("addons", {}).get("activeExperiment", {})


	# In[69]:

	participants = pings.filter(lambda p: experiment(p).get("id", None) == "plugin-block-beta47@experiments.mozilla.org" and experiment(p).get("branch", None) is not None)


	# In[70]:

	participants.map(lambda p: (experiment(p).get("branch", None), p)).countByKey()


	# In[71]:

	subset = get_one_ping_per_client(participants)


	# In[72]:

	compare_histograms(subset,
	"aggressive",
	"control",
	"payload/keyedHistograms/BLOCKED_ON_PLUGIN_INSTANCE_DESTROY_MS/Shockwave Flash21.0.0.242",
	"payload/keyedHistograms/BLOCKED_ON_PLUGIN_INSTANCE_INIT_MS/Shockwave Flash21.0.0.242",
	"payload/keyedHistograms/BLOCKED_ON_PLUGIN_MODULE_INIT_MS/Shockwave Flash21.0.0.242",
	"payload/keyedHistograms/BLOCKED_ON_PLUGIN_STREAM_INIT_MS/Shockwave Flash21.0.0.242"
	)


	# In[73]:

	compare_histograms(subset,
	"aggressive",
	"control",
	"payload/histograms/FLASH_PLUGIN_AREA",
	"payload/histograms/FLASH_PLUGIN_HEIGHT",
	"payload/histograms/FLASH_PLUGIN_WIDTH"
	)


	# In[74]:

	compare_histograms(subset,
	"aggressive",
	"control",
	# "payload/histograms/PLUGIN_BLOCKED_FOR_STABILITY", # is 0 for control.
	"payload/histograms/INPUT_EVENT_RESPONSE_MS",
	"payload/histograms/FLASH_PLUGIN_INSTANCES_ON_PAGE",
	"payload/histograms/FX_PAGE_LOAD_MS"
	)


	# In[75]:

	compare_histograms(subset,
	"aggressive",
	"control",
	#"payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/plugin",
	#"payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin"
	)


	# In[76]:

	compare_histograms(subset,
	"aggressive",
	"control",
	#"payload/histograms/PLUGIN_HANG_NOTICE_COUNT",
	"payload/histograms/PLUGIN_HANG_TIME",
	"payload/histograms/PLUGIN_HANG_UI_RESPONSE_TIME",
	"payload/histograms/PLUGIN_HANG_UI_USER_RESPONSE"
	)


	# In[79]:

	compare_histograms(subset,
	"aggressive",
	"control",
	#"payload/keyedHistograms/PLUGIN_ACTIVATION_COUNT/flash",
	#"payload/keyedHistograms/PLUGIN_ACTIVATION_COUNT/java"
	)


	# In[78]:

	compare_histograms(subset,
	"aggressive",
	"control",
	"payload/histograms/HTTP_REQUEST_PER_PAGE",
	"payload/histograms/PLUGIN_TINY_CONTENT"
	)