@chutten
Created May 2, 2016 18:01
beta47_slow_script
# coding: utf-8
# ### Beta 47 Slow Script
# This is a very brief introduction to Spark and Telemetry in Python. Have a look at the [tutorial](https://gist.github.com/vitillo/25a20b7c8685c0c82422) in Scala and the associated [talk](http://www.slideshare.net/RobertoAgostinoVitil/spark-meets-telemetry) if you are interested in learning more about Spark.
# In[1]:
import numbers
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
from montecarlino import grouped_permutation_test
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history, get_records
get_ipython().magic(u'pylab inline')
# In[2]:
sc.defaultParallelism
# In[15]:
pings = get_pings(sc, app="Firefox", channel="beta", version="47.0", fraction=0.25)
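# fraction=0.25 samples roughly a quarter of the beta 47 ping volume, which is
# plenty for this comparison and keeps the job reasonably fast.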
# In[38]:
subset = get_pings_properties(pings, ["clientId",
                                      "environment/settings/userPrefs/dom.max_script_run_time",
                                      "environment/settings/e10sCohort",
                                      "payload/simpleMeasurements/uptime",
                                      "payload/histograms/SLOW_SCRIPT_PAGE_COUNT"])
# In[43]:
subset = subset.filter(lambda p: p["environment/settings/e10sCohort"] in ["test", "control"])
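# The e10sCohort experiment splits clients into "test" (e10s enabled) and
# "control" (e10s disabled); pings from any other cohort are dropped here.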
# In[44]:
subset = get_one_ping_per_client(subset)
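# Keep a single ping per clientId so clients that submit many pings don't
# dominate the comparison.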
# In[45]:
cached = subset.cache()
# How many pings are we looking at?
# In[46]:
cached.count()
# In[47]:
cached.map(lambda p: (p["environment/settings/userPrefs/dom.max_script_run_time"], p)).countByKey()
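# countByKey tallies how many pings report each value of the
# dom.max_script_run_time pref; a missing value (None) indicates the pref was
# not overridden, i.e. the default slow-script timeout.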
# In[49]:
frame = pd.DataFrame(cached.collect())
e10s = frame[frame["environment/settings/e10sCohort"] == "test"]
none10s = frame[frame["environment/settings/e10sCohort"] == "control"]
e10s.count(), none10s.count()
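# DataFrame.count() reports the number of non-null entries per column, so this
# is a quick per-cohort size check.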
# In[50]:
def normalize_uptime_hour(frame, metric):
    frame = frame[frame["payload/simpleMeasurements/uptime"] > 0]
    frame = frame[frame[metric] >= 0]
    frame[metric] = 60 * frame[metric] / frame["payload/simpleMeasurements/uptime"]
    return frame
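# simpleMeasurements/uptime is reported in minutes, so 60 * count / uptime turns
# the per-session page count into slow-script pages per hour of uptime.
# For example (hypothetical numbers): 4 slow-script pages over 120 minutes of
# uptime normalizes to 2.0 pages per hour.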
e10s_norm = normalize_uptime_hour(e10s, "payload/histograms/SLOW_SCRIPT_PAGE_COUNT")
none10s_norm = normalize_uptime_hour(none10s, "payload/histograms/SLOW_SCRIPT_PAGE_COUNT")
# In[51]:
def median_diff(xs, ys):
    return np.median(xs) - np.median(ys)

def compare_scalars(metric, *groups):
    print "Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(metric,
                                                                        median_diff(*groups),
                                                                        np.median(groups[0]),
                                                                        np.median(groups[1]))
    print "The probability of this effect being purely by chance is {:.2f}.".format(grouped_permutation_test(median_diff, groups, num_samples=10000))
# In[52]:
metric = "payload/histograms/SLOW_SCRIPT_PAGE_COUNT"
compare_scalars(metric, e10s_norm[metric], none10s_norm[metric])
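# Compares the median normalized slow-script page rate between the e10s "test"
# and "control" cohorts across all clients in the sample.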
# In[53]:
pref = "environment/settings/userPrefs/dom.max_script_run_time"
compare_scalars(metric, e10s_norm[e10s_norm[pref].isnull()][metric], none10s_norm[none10s_norm[pref].isnull()][metric])
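# Same comparison, restricted to clients where dom.max_script_run_time is not
# overridden (isnull), i.e. clients running with the default slow-script timeout.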