@vitillo
Created May 28, 2015 09:11
Bug 1165304
{"nbformat_minor": 0, "cells": [{"source": "### Telemetry Hello World", "cell_type": "markdown", "metadata": {}}, {"source": "This is a very a brief introduction to Spark and Telemetry in Python. You should have a look at the [tutorial](https://gist.github.com/vitillo/25a20b7c8685c0c82422) in Scala and the associated [talk](http://www.slideshare.net/RobertoAgostinoVitil/spark-meets-telemetry) if you are interested to learn more about Spark. ", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport plotly.plotly as py\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client\n\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "### Basics", "cell_type": "markdown", "metadata": {}}, {"source": "The goal of this example is to plot the startup distribution for each OS. Let's see how many parallel workers we have at our disposal:", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "sc.defaultParallelism", "outputs": [{"execution_count": 2, "output_type": "execute_result", "data": {"text/plain": "16"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "Let's fetch 10% of Telemetry submissions for a given build-id...", "cell_type": "markdown", "metadata": {}}, {"execution_count": 7, "cell_type": "code", "source": "pings = get_pings(sc, app=\"Firefox\", channel=\"nightly\", submission_date=\"20150526\", fraction=1)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"source": "... 
Next, let's extract only the attributes we need from the Telemetry submissions:

```python
subset = get_pings_properties(pings, ["application/channel", "environment"])
```

```python
subset.filter(lambda p: p["application/channel"] != p["environment"]["settings"]["update"]["channel"]).first()
```

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-9-a5abd61600b0> in <module>()
    ----> 1 subset.filter(lambda p: p["application/channel"] != p["environment"]["settings"]["update"]["channel"]).first()

    /home/hadoop/spark/python/pyspark/rdd.py in first(self)
       1140         if rs:
       1141             return rs[0]
    -> 1142         raise ValueError("RDD is empty")
       1143
       1144     def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):

    ValueError: RDD is empty

The `ValueError` is the interesting result here: the filtered RDD is empty, i.e. no submission in this dataset has an `application/channel` that disagrees with `environment/settings/update/channel`.
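Since `first()` raises on an empty RDD, a non-raising variant of the same check is to count the mismatches instead. A small sketch against the same `subset`:

```python
# count() returns 0 for an empty RDD instead of raising like first() does.
mismatches = subset.filter(
    lambda p: p["application/channel"] != p["environment"]["settings"]["update"]["channel"]
).count()
print(mismatches)  # 0 here: the two channel fields agree for every submission
```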
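Finally, a sketch of the goal stated at the top, the startup distribution per OS. The property paths are assumptions not confirmed by this notebook, which never gets that far: the OS name is taken to live under `environment/system/os/name` and the startup time under `payload/simpleMeasurements/firstPaint`.

```python
# Keep at most one ping per client to avoid over-weighting chatty clients;
# clientId is assumed to be what get_one_ping_per_client deduplicates on.
startup = get_pings_properties(pings, ["clientId",
                                       "environment/system/os/name",
                                       "payload/simpleMeasurements/firstPaint"])
startup = get_one_ping_per_client(startup)

# Collect (os, firstPaint) pairs, dropping pings without a measurement.
pairs = startup.map(lambda p: (p["environment/system/os/name"],
                               p["payload/simpleMeasurements/firstPaint"])) \
               .filter(lambda pair: pair[1] is not None) \
               .collect()

# One histogram per OS of the first-paint startup times.
frame = pd.DataFrame(pairs, columns=["os", "firstPaint"])
frame.hist(column="firstPaint", by="os", bins=50)
```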