# coding: utf-8
# # MainSummary Dataset example
#
# Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).
#
# Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716).
# In[1]:
# How many cores are we running on?
sc.defaultParallelism
# ### Read source data
#
# Read the data from the parquet datastore on S3.
# In[2]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
bucket = "telemetry-parquet"
prefix = "main_summary/v3"
get_ipython().magic(u'time dataset = sqlContext.read.load("s3://{}/{}".format(bucket, prefix), "parquet")')
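# The `%time` magic above is IPython-only syntax. As a minimal sketch
# (assuming the same bucket layout), the equivalent plain PySpark call is:
dataset_plain = sqlContext.read.parquet("s3://{}/{}".format(bucket, prefix))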
# In[3]:
dataset.printSchema()
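# As a quick sanity check (a sketch; these column names are the ones used
# in the query below), peek at a few rows of the fields we will need:
dataset.select("client_id", "active_experiment_id", "submission_date_s3").show(5)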
# # Example query:
# #### How many unique `client_id`s took part in each Telemetry Experiment, reporting data between July 4 and July 6, 2016 (inclusive)?
#
# First, filter for the target time range
# In[4]:
get_ipython().magic(u"time dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')")
# Then filter for non-null `active_experiment_id`
# In[5]:
get_ipython().magic(u'time experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select("active_experiment_id", "client_id")')
# Now group by experiment and count the unique `client_id`s
# In[6]:
from pyspark.sql.functions import countDistinct
get_ipython().magic(u'time grouped = experiments.groupby("active_experiment_id").agg(countDistinct(experiments.client_id)).collect()')
# In[7]:
grouped
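# As a usage sketch, the same aggregation can also be kept as a DataFrame,
# given a friendlier column name, and ordered by experiment size before
# displaying (the `n_clients` alias is just for readability):
counts = (experiments
          .groupBy("active_experiment_id")
          .agg(countDistinct(experiments.client_id).alias("n_clients"))
          .orderBy("n_clients", ascending=False))
counts.show(20, False)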