mreid-moz/MainSummaryExample.ipynb Secret

## MainSummaryExample.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MainSummary Dataset example\n",
    "\n",
    "Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).\n",
    "\n",
    "Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many cores are we running on? \n",
    "sc.defaultParallelism"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read source data\n",
    "\n",
    "Read the data from the parquet datastore on S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
      "Wall time: 18.7 s\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SQLContext\n",
    "from pyspark.sql.types import *\n",
    "\n",
    "bucket = \"telemetry-parquet\"\n",
    "prefix = \"main_summary/v3\"\n",
    "%time dataset = sqlContext.read.load(\"s3://{}/{}\".format(bucket, prefix), \"parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- document_id: string (nullable = true)\n",
      " |-- client_id: string (nullable = true)\n",
      " |-- channel: string (nullable = true)\n",
      " |-- normalized_channel: string (nullable = true)\n",
      " |-- country: string (nullable = true)\n",
      " |-- city: string (nullable = true)\n",
      " |-- os: string (nullable = true)\n",
      " |-- os_version: string (nullable = true)\n",
      " |-- os_service_pack_major: string (nullable = true)\n",
      " |-- os_service_pack_minor: string (nullable = true)\n",
      " |-- profile_creation_date: long (nullable = true)\n",
      " |-- subsession_start_date: string (nullable = true)\n",
      " |-- subsession_length: long (nullable = true)\n",
      " |-- distribution_id: string (nullable = true)\n",
      " |-- submission_date: string (nullable = true)\n",
      " |-- sync_configured: boolean (nullable = true)\n",
      " |-- sync_count_desktop: integer (nullable = true)\n",
      " |-- sync_count_mobile: integer (nullable = true)\n",
      " |-- app_build_id: string (nullable = true)\n",
      " |-- app_display_version: string (nullable = true)\n",
      " |-- app_name: string (nullable = true)\n",
      " |-- app_version: string (nullable = true)\n",
      " |-- timestamp: long (nullable = true)\n",
      " |-- env_build_id: string (nullable = true)\n",
      " |-- env_build_version: string (nullable = true)\n",
      " |-- env_build_arch: string (nullable = true)\n",
      " |-- e10s_enabled: boolean (nullable = true)\n",
      " |-- e10s_cohort: string (nullable = true)\n",
      " |-- locale: string (nullable = true)\n",
      " |-- active_experiment_id: string (nullable = true)\n",
      " |-- active_experiment_branch: string (nullable = true)\n",
      " |-- reason: string (nullable = true)\n",
      " |-- timezone_offset: integer (nullable = true)\n",
      " |-- plugin_hangs: integer (nullable = true)\n",
      " |-- aborts_plugin: integer (nullable = true)\n",
      " |-- aborts_content: integer (nullable = true)\n",
      " |-- aborts_gmplugin: integer (nullable = true)\n",
      " |-- crashes_detected_plugin: integer (nullable = true)\n",
      " |-- crashes_detected_content: integer (nullable = true)\n",
      " |-- crashes_detected_gmplugin: integer (nullable = true)\n",
      " |-- crash_submit_attempt_main: integer (nullable = true)\n",
      " |-- crash_submit_attempt_content: integer (nullable = true)\n",
      " |-- crash_submit_attempt_plugin: integer (nullable = true)\n",
      " |-- crash_submit_success_main: integer (nullable = true)\n",
      " |-- crash_submit_success_content: integer (nullable = true)\n",
      " |-- crash_submit_success_plugin: integer (nullable = true)\n",
      " |-- active_addons_count: long (nullable = true)\n",
      " |-- flash_version: string (nullable = true)\n",
      " |-- vendor: string (nullable = true)\n",
      " |-- is_default_browser: boolean (nullable = true)\n",
      " |-- default_search_engine_data_name: string (nullable = true)\n",
      " |-- default_search_engine: string (nullable = true)\n",
      " |-- loop_activity_counter: struct (nullable = true)\n",
      " |    |-- open_panel: integer (nullable = true)\n",
      " |    |-- open_conversation: integer (nullable = true)\n",
      " |    |-- room_open: integer (nullable = true)\n",
      " |    |-- room_share: integer (nullable = true)\n",
      " |    |-- room_delete: integer (nullable = true)\n",
      " |-- devtools_toolbox_opened_count: integer (nullable = true)\n",
      " |-- client_submission_date: string (nullable = true)\n",
      " |-- places_bookmarks_count: integer (nullable = true)\n",
      " |-- places_pages_count: integer (nullable = true)\n",
      " |-- push_api_notification_received: integer (nullable = true)\n",
      " |-- web_notification_shown: integer (nullable = true)\n",
      " |-- popup_notification_stats: map (nullable = true)\n",
      " |    |-- key: string\n",
      " |    |-- value: struct (valueContainsNull = true)\n",
      " |    |    |-- offered: integer (nullable = true)\n",
      " |    |    |-- action_1: integer (nullable = true)\n",
      " |    |    |-- action_2: integer (nullable = true)\n",
      " |    |    |-- action_3: integer (nullable = true)\n",
      " |    |    |-- action_last: integer (nullable = true)\n",
      " |    |    |-- dismissal_click_elsewhere: integer (nullable = true)\n",
      " |    |    |-- dismissal_leave_page: integer (nullable = true)\n",
      " |    |    |-- dismissal_close_button: integer (nullable = true)\n",
      " |    |    |-- dismissal_not_now: integer (nullable = true)\n",
      " |    |    |-- open_submenu: integer (nullable = true)\n",
      " |    |    |-- learn_more: integer (nullable = true)\n",
      " |    |    |-- reopen_offered: integer (nullable = true)\n",
      " |    |    |-- reopen_action_1: integer (nullable = true)\n",
      " |    |    |-- reopen_action_2: integer (nullable = true)\n",
      " |    |    |-- reopen_action_3: integer (nullable = true)\n",
      " |    |    |-- reopen_action_last: integer (nullable = true)\n",
      " |    |    |-- reopen_dismissal_click_elsewhere: integer (nullable = true)\n",
      " |    |    |-- reopen_dismissal_leave_page: integer (nullable = true)\n",
      " |    |    |-- reopen_dismissal_close_button: integer (nullable = true)\n",
      " |    |    |-- reopen_dismissal_not_now: integer (nullable = true)\n",
      " |    |    |-- reopen_open_submenu: integer (nullable = true)\n",
      " |    |    |-- reopen_learn_more: integer (nullable = true)\n",
      " |-- search_counts: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- engine: string (nullable = true)\n",
      " |    |    |-- source: string (nullable = true)\n",
      " |    |    |-- count: long (nullable = true)\n",
      " |-- submission_date_s3: string (nullable = true)\n",
      " |-- sample_id: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dataset.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example query:\n",
    "#### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?\n",
    "\n",
    "First, filter for the target time range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 16 ms, sys: 8 ms, total: 24 ms\n",
      "Wall time: 120 ms\n"
     ]
    }
   ],
   "source": [
    "%time dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then filter for non-null `active_experiment_id`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12 ms, sys: 12 ms, total: 24 ms\n",
      "Wall time: 95.6 ms\n"
     ]
    }
   ],
   "source": [
    "%time experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select(\"active_experiment_id\", \"client_id\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now group by experiment and count the unique `client_id`s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 28 ms, sys: 8 ms, total: 36 ms\n",
      "Wall time: 2min 15s\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.functions import countDistinct\n",
    "%time grouped = experiments.groupby(\"active_experiment_id\").agg(countDistinct(experiments.client_id)).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(active_experiment_id=u'displayport-tuning-nightly@experiments.mozilla.org', count(client_id)=9),\n",
       " Row(active_experiment_id=u'unified-urlbar@experiments.mozilla.org', count(client_id)=322),\n",
       " Row(active_experiment_id=u'plugin-block-beta47@experiments.mozilla.org', count(client_id)=449),\n",
       " Row(active_experiment_id=u'e10s-beta45-withaddons@experiments.mozilla.org', count(client_id)=250),\n",
       " Row(active_experiment_id=u'e10s-beta46-noapz@experiments.mozilla.org', count(client_id)=271),\n",
       " Row(active_experiment_id=u'e10s-enabled-beta-20151214@experiments.mozilla.org', count(client_id)=83),\n",
       " Row(active_experiment_id=u'e10s-enabled-aurora-20151020@experiments.mozilla.org', count(client_id)=13),\n",
       " Row(active_experiment_id=u'e10s-beta45-withoutaddons@experiments.mozilla.org', count(client_id)=350)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  },
  "widgets": {
   "state": {},
   "version": "1.1.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## MainSummaryExample.py

# coding: utf-8

# # MainSummary Dataset example
#
# Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).
#
# Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716).
#
# NOTE: this script is an export of the notebook and only runs inside an
# IPython session: `sc`, `sqlContext` and `get_ipython` are never defined here
# and are presumably injected by the Spark notebook environment.

# In[1]:

# How many cores are we running on?
sc.defaultParallelism


# ### Read source data
#
# Read the data from the parquet datastore on S3.

# In[2]:

# NOTE(review): neither `SQLContext` nor any name from the wildcard import is
# referenced in the visible script (the load below uses the pre-existing
# lowercase `sqlContext`); kept as-is in case later code depends on them.
from pyspark.sql import SQLContext
from pyspark.sql.types import *

bucket = "telemetry-parquet"
prefix = "main_summary/v3"
# `%time` is invoked through run_line_magic(name, line) because
# InteractiveShell.magic() is deprecated; the timed statement is unchanged.
get_ipython().run_line_magic(u'time', u'dataset = sqlContext.read.load("s3://{}/{}".format(bucket, prefix), "parquet")')


# In[3]:

dataset.printSchema()


# # Example query:
# #### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?
#
# First, filter for the target time range

# In[4]:

# Both bounds are inclusive, so the selected days are 20160704..20160706.
# The bounds are fixed-width YYYYMMDD strings; this assumes
# `submission_date_s3` uses the same format, so that string comparison
# matches chronological order.
get_ipython().run_line_magic(u'time', u"dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')")


# Then filter for non-null `active_experiment_id`

# In[5]:

# Keep only the two columns the aggregation below needs.
get_ipython().run_line_magic(u'time', u'experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select("active_experiment_id", "client_id")')


# Now group by experiment and count the unique `client_id`s

# In[6]:

from pyspark.sql.functions import countDistinct
# .collect() brings the aggregated rows back into `grouped` as a local list.
get_ipython().run_line_magic(u'time', u'grouped = experiments.groupby("active_experiment_id").agg(countDistinct(experiments.client_id)).collect()')


# In[7]:

grouped
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# MainSummary Dataset example\n",
	"\n",
	"Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).\n",
	"\n",
	"Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716). "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"32"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# How many cores are we running on? \n",
	"sc.defaultParallelism"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read source data\n",
	"\n",
	"Read the data from the parquet datastore on S3."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
	"Wall time: 18.7 s\n"
	]
	}
	],
	"source": [
	"from pyspark.sql import SQLContext\n",
	"from pyspark.sql.types import *\n",
	"\n",
	"bucket = \"telemetry-parquet\"\n",
	"prefix = \"main_summary/v3\"\n",
	"%time dataset = sqlContext.read.load(\"s3://{}/{}\".format(bucket, prefix), \"parquet\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"root\n",
	" \|-- document_id: string (nullable = true)\n",
	" \|-- client_id: string (nullable = true)\n",
	" \|-- channel: string (nullable = true)\n",
	" \|-- normalized_channel: string (nullable = true)\n",
	" \|-- country: string (nullable = true)\n",
	" \|-- city: string (nullable = true)\n",
	" \|-- os: string (nullable = true)\n",
	" \|-- os_version: string (nullable = true)\n",
	" \|-- os_service_pack_major: string (nullable = true)\n",
	" \|-- os_service_pack_minor: string (nullable = true)\n",
	" \|-- profile_creation_date: long (nullable = true)\n",
	" \|-- subsession_start_date: string (nullable = true)\n",
	" \|-- subsession_length: long (nullable = true)\n",
	" \|-- distribution_id: string (nullable = true)\n",
	" \|-- submission_date: string (nullable = true)\n",
	" \|-- sync_configured: boolean (nullable = true)\n",
	" \|-- sync_count_desktop: integer (nullable = true)\n",
	" \|-- sync_count_mobile: integer (nullable = true)\n",
	" \|-- app_build_id: string (nullable = true)\n",
	" \|-- app_display_version: string (nullable = true)\n",
	" \|-- app_name: string (nullable = true)\n",
	" \|-- app_version: string (nullable = true)\n",
	" \|-- timestamp: long (nullable = true)\n",
	" \|-- env_build_id: string (nullable = true)\n",
	" \|-- env_build_version: string (nullable = true)\n",
	" \|-- env_build_arch: string (nullable = true)\n",
	" \|-- e10s_enabled: boolean (nullable = true)\n",
	" \|-- e10s_cohort: string (nullable = true)\n",
	" \|-- locale: string (nullable = true)\n",
	" \|-- active_experiment_id: string (nullable = true)\n",
	" \|-- active_experiment_branch: string (nullable = true)\n",
	" \|-- reason: string (nullable = true)\n",
	" \|-- timezone_offset: integer (nullable = true)\n",
	" \|-- plugin_hangs: integer (nullable = true)\n",
	" \|-- aborts_plugin: integer (nullable = true)\n",
	" \|-- aborts_content: integer (nullable = true)\n",
	" \|-- aborts_gmplugin: integer (nullable = true)\n",
	" \|-- crashes_detected_plugin: integer (nullable = true)\n",
	" \|-- crashes_detected_content: integer (nullable = true)\n",
	" \|-- crashes_detected_gmplugin: integer (nullable = true)\n",
	" \|-- crash_submit_attempt_main: integer (nullable = true)\n",
	" \|-- crash_submit_attempt_content: integer (nullable = true)\n",
	" \|-- crash_submit_attempt_plugin: integer (nullable = true)\n",
	" \|-- crash_submit_success_main: integer (nullable = true)\n",
	" \|-- crash_submit_success_content: integer (nullable = true)\n",
	" \|-- crash_submit_success_plugin: integer (nullable = true)\n",
	" \|-- active_addons_count: long (nullable = true)\n",
	" \|-- flash_version: string (nullable = true)\n",
	" \|-- vendor: string (nullable = true)\n",
	" \|-- is_default_browser: boolean (nullable = true)\n",
	" \|-- default_search_engine_data_name: string (nullable = true)\n",
	" \|-- default_search_engine: string (nullable = true)\n",
	" \|-- loop_activity_counter: struct (nullable = true)\n",
	" \| \|-- open_panel: integer (nullable = true)\n",
	" \| \|-- open_conversation: integer (nullable = true)\n",
	" \| \|-- room_open: integer (nullable = true)\n",
	" \| \|-- room_share: integer (nullable = true)\n",
	" \| \|-- room_delete: integer (nullable = true)\n",
	" \|-- devtools_toolbox_opened_count: integer (nullable = true)\n",
	" \|-- client_submission_date: string (nullable = true)\n",
	" \|-- places_bookmarks_count: integer (nullable = true)\n",
	" \|-- places_pages_count: integer (nullable = true)\n",
	" \|-- push_api_notification_received: integer (nullable = true)\n",
	" \|-- web_notification_shown: integer (nullable = true)\n",
	" \|-- popup_notification_stats: map (nullable = true)\n",
	" \| \|-- key: string\n",
	" \| \|-- value: struct (valueContainsNull = true)\n",
	" \| \| \|-- offered: integer (nullable = true)\n",
	" \| \| \|-- action_1: integer (nullable = true)\n",
	" \| \| \|-- action_2: integer (nullable = true)\n",
	" \| \| \|-- action_3: integer (nullable = true)\n",
	" \| \| \|-- action_last: integer (nullable = true)\n",
	" \| \| \|-- dismissal_click_elsewhere: integer (nullable = true)\n",
	" \| \| \|-- dismissal_leave_page: integer (nullable = true)\n",
	" \| \| \|-- dismissal_close_button: integer (nullable = true)\n",
	" \| \| \|-- dismissal_not_now: integer (nullable = true)\n",
	" \| \| \|-- open_submenu: integer (nullable = true)\n",
	" \| \| \|-- learn_more: integer (nullable = true)\n",
	" \| \| \|-- reopen_offered: integer (nullable = true)\n",
	" \| \| \|-- reopen_action_1: integer (nullable = true)\n",
	" \| \| \|-- reopen_action_2: integer (nullable = true)\n",
	" \| \| \|-- reopen_action_3: integer (nullable = true)\n",
	" \| \| \|-- reopen_action_last: integer (nullable = true)\n",
	" \| \| \|-- reopen_dismissal_click_elsewhere: integer (nullable = true)\n",
	" \| \| \|-- reopen_dismissal_leave_page: integer (nullable = true)\n",
	" \| \| \|-- reopen_dismissal_close_button: integer (nullable = true)\n",
	" \| \| \|-- reopen_dismissal_not_now: integer (nullable = true)\n",
	" \| \| \|-- reopen_open_submenu: integer (nullable = true)\n",
	" \| \| \|-- reopen_learn_more: integer (nullable = true)\n",
	" \|-- search_counts: array (nullable = true)\n",
	" \| \|-- element: struct (containsNull = true)\n",
	" \| \| \|-- engine: string (nullable = true)\n",
	" \| \| \|-- source: string (nullable = true)\n",
	" \| \| \|-- count: long (nullable = true)\n",
	" \|-- submission_date_s3: string (nullable = true)\n",
	" \|-- sample_id: string (nullable = true)\n",
	"\n"
	]
	}
	],
	"source": [
	"dataset.printSchema()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Example query:\n",
	"#### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?\n",
	"\n",
	"First, filter for the target time range"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 16 ms, sys: 8 ms, total: 24 ms\n",
	"Wall time: 120 ms\n"
	]
	}
	],
	"source": [
	"%time dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Then filter for non-null `active_experiment_id`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 12 ms, sys: 12 ms, total: 24 ms\n",
	"Wall time: 95.6 ms\n"
	]
	}
	],
	"source": [
	"%time experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select(\"active_experiment_id\", \"client_id\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now group by experiment and count the unique `client_id`s"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 28 ms, sys: 8 ms, total: 36 ms\n",
	"Wall time: 2min 15s\n"
	]
	}
	],
	"source": [
	"from pyspark.sql.functions import countDistinct\n",
	"%time grouped = experiments.groupby(\"active_experiment_id\").agg(countDistinct(experiments.client_id)).collect()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[Row(active_experiment_id=u'displayport-tuning-nightly@experiments.mozilla.org', count(client_id)=9),\n",
	" Row(active_experiment_id=u'unified-urlbar@experiments.mozilla.org', count(client_id)=322),\n",
	" Row(active_experiment_id=u'plugin-block-beta47@experiments.mozilla.org', count(client_id)=449),\n",
	" Row(active_experiment_id=u'e10s-beta45-withaddons@experiments.mozilla.org', count(client_id)=250),\n",
	" Row(active_experiment_id=u'e10s-beta46-noapz@experiments.mozilla.org', count(client_id)=271),\n",
	" Row(active_experiment_id=u'e10s-enabled-beta-20151214@experiments.mozilla.org', count(client_id)=83),\n",
	" Row(active_experiment_id=u'e10s-enabled-aurora-20151020@experiments.mozilla.org', count(client_id)=13),\n",
	" Row(active_experiment_id=u'e10s-beta45-withoutaddons@experiments.mozilla.org', count(client_id)=350)]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"grouped"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	},
	"widgets": {
	"state": {},
	"version": "1.1.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}

	# coding: utf-8

	# # MainSummary Dataset example
	#
	# Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).
	#
	# Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716).
	#
	# NOTE: this script is an export of the notebook and only runs inside an
	# IPython session: `sc`, `sqlContext` and `get_ipython` are never defined here
	# and are presumably injected by the Spark notebook environment.

	# In[1]:

	# How many cores are we running on?
	sc.defaultParallelism


	# ### Read source data
	#
	# Read the data from the parquet datastore on S3.

	# In[2]:

	# NOTE(review): neither `SQLContext` nor any name from the wildcard import is
	# referenced in the visible script (the load below uses the pre-existing
	# lowercase `sqlContext`); kept as-is in case later code depends on them.
	from pyspark.sql import SQLContext
	from pyspark.sql.types import *

	bucket = "telemetry-parquet"
	prefix = "main_summary/v3"
	# `%time` is invoked through run_line_magic(name, line) because
	# InteractiveShell.magic() is deprecated; the timed statement is unchanged.
	get_ipython().run_line_magic(u'time', u'dataset = sqlContext.read.load("s3://{}/{}".format(bucket, prefix), "parquet")')


	# In[3]:

	dataset.printSchema()


	# # Example query:
	# #### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?
	#
	# First, filter for the target time range

	# In[4]:

	# Both bounds are inclusive, so the selected days are 20160704..20160706.
	# The bounds are fixed-width YYYYMMDD strings; this assumes
	# `submission_date_s3` uses the same format, so that string comparison
	# matches chronological order.
	get_ipython().run_line_magic(u'time', u"dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')")


	# Then filter for non-null `active_experiment_id`

	# In[5]:

	# Keep only the two columns the aggregation below needs.
	get_ipython().run_line_magic(u'time', u'experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select("active_experiment_id", "client_id")')


	# Now group by experiment and count the unique `client_id`s

	# In[6]:

	from pyspark.sql.functions import countDistinct
	# .collect() brings the aggregated rows back into `grouped` as a local list.
	get_ipython().run_line_magic(u'time', u'grouped = experiments.groupby("active_experiment_id").agg(countDistinct(experiments.client_id)).collect()')


	# In[7]:

	grouped