Skip to content

Instantly share code, notes, and snippets.

@mreid-moz
Last active January 23, 2017 13:42
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mreid-moz/518f7515aac54cd246635c333683ecce to your computer and use it in GitHub Desktop.
Save mreid-moz/518f7515aac54cd246635c333683ecce to your computer and use it in GitHub Desktop.
MainSummaryExample
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MainSummary Dataset example\n",
"\n",
"Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).\n",
"\n",
"Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716). "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"32"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# How many cores are we running on? \n",
"sc.defaultParallelism"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read source data\n",
"\n",
"Read the data from the parquet datastore on S3."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n",
"Wall time: 18.7 s\n"
]
}
],
"source": [
"from pyspark.sql import SQLContext\n",
"from pyspark.sql.types import *\n",
"\n",
"bucket = \"telemetry-parquet\"\n",
"prefix = \"main_summary/v3\"\n",
"%time dataset = sqlContext.read.load(\"s3://{}/{}\".format(bucket, prefix), \"parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- document_id: string (nullable = true)\n",
" |-- client_id: string (nullable = true)\n",
" |-- channel: string (nullable = true)\n",
" |-- normalized_channel: string (nullable = true)\n",
" |-- country: string (nullable = true)\n",
" |-- city: string (nullable = true)\n",
" |-- os: string (nullable = true)\n",
" |-- os_version: string (nullable = true)\n",
" |-- os_service_pack_major: string (nullable = true)\n",
" |-- os_service_pack_minor: string (nullable = true)\n",
" |-- profile_creation_date: long (nullable = true)\n",
" |-- subsession_start_date: string (nullable = true)\n",
" |-- subsession_length: long (nullable = true)\n",
" |-- distribution_id: string (nullable = true)\n",
" |-- submission_date: string (nullable = true)\n",
" |-- sync_configured: boolean (nullable = true)\n",
" |-- sync_count_desktop: integer (nullable = true)\n",
" |-- sync_count_mobile: integer (nullable = true)\n",
" |-- app_build_id: string (nullable = true)\n",
" |-- app_display_version: string (nullable = true)\n",
" |-- app_name: string (nullable = true)\n",
" |-- app_version: string (nullable = true)\n",
" |-- timestamp: long (nullable = true)\n",
" |-- env_build_id: string (nullable = true)\n",
" |-- env_build_version: string (nullable = true)\n",
" |-- env_build_arch: string (nullable = true)\n",
" |-- e10s_enabled: boolean (nullable = true)\n",
" |-- e10s_cohort: string (nullable = true)\n",
" |-- locale: string (nullable = true)\n",
" |-- active_experiment_id: string (nullable = true)\n",
" |-- active_experiment_branch: string (nullable = true)\n",
" |-- reason: string (nullable = true)\n",
" |-- timezone_offset: integer (nullable = true)\n",
" |-- plugin_hangs: integer (nullable = true)\n",
" |-- aborts_plugin: integer (nullable = true)\n",
" |-- aborts_content: integer (nullable = true)\n",
" |-- aborts_gmplugin: integer (nullable = true)\n",
" |-- crashes_detected_plugin: integer (nullable = true)\n",
" |-- crashes_detected_content: integer (nullable = true)\n",
" |-- crashes_detected_gmplugin: integer (nullable = true)\n",
" |-- crash_submit_attempt_main: integer (nullable = true)\n",
" |-- crash_submit_attempt_content: integer (nullable = true)\n",
" |-- crash_submit_attempt_plugin: integer (nullable = true)\n",
" |-- crash_submit_success_main: integer (nullable = true)\n",
" |-- crash_submit_success_content: integer (nullable = true)\n",
" |-- crash_submit_success_plugin: integer (nullable = true)\n",
" |-- active_addons_count: long (nullable = true)\n",
" |-- flash_version: string (nullable = true)\n",
" |-- vendor: string (nullable = true)\n",
" |-- is_default_browser: boolean (nullable = true)\n",
" |-- default_search_engine_data_name: string (nullable = true)\n",
" |-- default_search_engine: string (nullable = true)\n",
" |-- loop_activity_counter: struct (nullable = true)\n",
" | |-- open_panel: integer (nullable = true)\n",
" | |-- open_conversation: integer (nullable = true)\n",
" | |-- room_open: integer (nullable = true)\n",
" | |-- room_share: integer (nullable = true)\n",
" | |-- room_delete: integer (nullable = true)\n",
" |-- devtools_toolbox_opened_count: integer (nullable = true)\n",
" |-- client_submission_date: string (nullable = true)\n",
" |-- places_bookmarks_count: integer (nullable = true)\n",
" |-- places_pages_count: integer (nullable = true)\n",
" |-- push_api_notification_received: integer (nullable = true)\n",
" |-- web_notification_shown: integer (nullable = true)\n",
" |-- popup_notification_stats: map (nullable = true)\n",
" | |-- key: string\n",
" | |-- value: struct (valueContainsNull = true)\n",
" | | |-- offered: integer (nullable = true)\n",
" | | |-- action_1: integer (nullable = true)\n",
" | | |-- action_2: integer (nullable = true)\n",
" | | |-- action_3: integer (nullable = true)\n",
" | | |-- action_last: integer (nullable = true)\n",
" | | |-- dismissal_click_elsewhere: integer (nullable = true)\n",
" | | |-- dismissal_leave_page: integer (nullable = true)\n",
" | | |-- dismissal_close_button: integer (nullable = true)\n",
" | | |-- dismissal_not_now: integer (nullable = true)\n",
" | | |-- open_submenu: integer (nullable = true)\n",
" | | |-- learn_more: integer (nullable = true)\n",
" | | |-- reopen_offered: integer (nullable = true)\n",
" | | |-- reopen_action_1: integer (nullable = true)\n",
" | | |-- reopen_action_2: integer (nullable = true)\n",
" | | |-- reopen_action_3: integer (nullable = true)\n",
" | | |-- reopen_action_last: integer (nullable = true)\n",
" | | |-- reopen_dismissal_click_elsewhere: integer (nullable = true)\n",
" | | |-- reopen_dismissal_leave_page: integer (nullable = true)\n",
" | | |-- reopen_dismissal_close_button: integer (nullable = true)\n",
" | | |-- reopen_dismissal_not_now: integer (nullable = true)\n",
" | | |-- reopen_open_submenu: integer (nullable = true)\n",
" | | |-- reopen_learn_more: integer (nullable = true)\n",
" |-- search_counts: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- engine: string (nullable = true)\n",
" | | |-- source: string (nullable = true)\n",
" | | |-- count: long (nullable = true)\n",
" |-- submission_date_s3: string (nullable = true)\n",
" |-- sample_id: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"dataset.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Example query:\n",
"#### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?\n",
"\n",
"First, filter for the target time range"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16 ms, sys: 8 ms, total: 24 ms\n",
"Wall time: 120 ms\n"
]
}
],
"source": [
"%time dataset = dataset.filter(dataset.submission_date_s3 >= '20160704').filter(dataset.submission_date_s3 <= '20160706')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then filter for non-null `active_experiment_id`"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 12 ms, sys: 12 ms, total: 24 ms\n",
"Wall time: 95.6 ms\n"
]
}
],
"source": [
"%time experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select(\"active_experiment_id\", \"client_id\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now group by experiment and count the unique `client_id`s"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 28 ms, sys: 8 ms, total: 36 ms\n",
"Wall time: 2min 15s\n"
]
}
],
"source": [
"from pyspark.sql.functions import countDistinct\n",
"%time grouped = experiments.groupby(\"active_experiment_id\").agg(countDistinct(experiments.client_id)).collect()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(active_experiment_id=u'displayport-tuning-nightly@experiments.mozilla.org', count(client_id)=9),\n",
" Row(active_experiment_id=u'unified-urlbar@experiments.mozilla.org', count(client_id)=322),\n",
" Row(active_experiment_id=u'plugin-block-beta47@experiments.mozilla.org', count(client_id)=449),\n",
" Row(active_experiment_id=u'e10s-beta45-withaddons@experiments.mozilla.org', count(client_id)=250),\n",
" Row(active_experiment_id=u'e10s-beta46-noapz@experiments.mozilla.org', count(client_id)=271),\n",
" Row(active_experiment_id=u'e10s-enabled-beta-20151214@experiments.mozilla.org', count(client_id)=83),\n",
" Row(active_experiment_id=u'e10s-enabled-aurora-20151020@experiments.mozilla.org', count(client_id)=13),\n",
" Row(active_experiment_id=u'e10s-beta45-withoutaddons@experiments.mozilla.org', count(client_id)=350)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grouped"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
},
"widgets": {
"state": {},
"version": "1.1.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# # MainSummary Dataset example
#
# Documentation can be found [here](https://github.com/mozilla/telemetry-batch-view/blob/master/docs/MainSummary.md). Dataset generation code is [here](https://github.com/mozilla/telemetry-batch-view/blob/master/src/main/scala/com/mozilla/telemetry/views/MainSummaryView.scala).
#
# Details and requirements are in [Bug 1260847](https://bugzilla.mozilla.org/show_bug.cgi?id=1260847). See also [Bug 1254716](https://bugzilla.mozilla.org/show_bug.cgi?id=1254716).
#
# NOTE(review): `sc`, `sqlContext` and `get_ipython` are not defined anywhere in
# this script; presumably they are provided by the PySpark/IPython kernel the
# notebook was run in -- confirm before running this elsewhere.

# All imports live in one place so the script's dependencies are visible up front.
from pyspark.sql import SQLContext
from pyspark.sql.functions import countDistinct

# ### Configuration
#
# Everything a reader might want to tune: where the data lives and which
# submission dates (inclusive, `YYYYMMDD` strings) the example query covers.
bucket = "telemetry-parquet"
prefix = "main_summary/v3"
START_DATE = '20160704'
END_DATE = '20160706'

# In[1]:
# How many cores are we running on?
# (Reported via Spark's default parallelism.)
sc.defaultParallelism

# ### Read source data
#
# Read the data from the parquet datastore on S3.
# In[2]:
# Build the path in plain Python so the timed line contains only the load itself.
source_path = "s3://{}/{}".format(bucket, prefix)
# `run_line_magic` replaces the deprecated `get_ipython().magic(...)` call; the
# statement after the magic name is executed in the user namespace, so the
# names it assigns (`main_summary`, `dataset`, ...) are visible to later cells.
get_ipython().run_line_magic('time', 'main_summary = sqlContext.read.load(source_path, "parquet")')

# In[3]:
main_summary.printSchema()

# # Example query:
# #### How many unique `client_id`s took part in each Telemetry Experiment, reporting data from July 4 through July 6, 2016 (inclusive)?
#
# First, filter for the target time range
# In[4]:
# The unfiltered frame stays available as `main_summary`, so this cell can be
# re-run (or the date range changed) without reloading the data. `dataset`
# keeps the meaning later cells rely on: the rows within the date range.
get_ipython().run_line_magic('time', 'dataset = main_summary.filter(main_summary.submission_date_s3 >= START_DATE).filter(main_summary.submission_date_s3 <= END_DATE)')

# Then filter for non-null `active_experiment_id`
# In[5]:
get_ipython().run_line_magic('time', 'experiments = dataset.filter(dataset.active_experiment_id.isNotNull()).select("active_experiment_id", "client_id")')

# Now group by experiment and count the unique `client_id`s
# In[6]:
# `collect()` brings the aggregated rows back to the driver, so `grouped` is a
# plain Python list of `Row`s rather than a DataFrame.
get_ipython().run_line_magic('time', 'grouped = experiments.groupby("active_experiment_id").agg(countDistinct(experiments.client_id)).collect()')

# In[7]:
grouped
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment