Dexterp37/histogram-empty-key-counts.ipynb Secret

## histogram-empty-key-counts.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Find histograms with empty keys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find histograms with empty keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/anaconda2/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:\n",
      "\n",
      "pylab import has clobbered these variables: ['Figure', 'Annotation']\n",
      "`%matplotlib` prevents importing * from pylab and numpy\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import ujson as json\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.plotly as py\n",
    "from plotly.graph_objs import *\n",
    "import IPython\n",
    "\n",
    "from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client\n",
    "from pprint import pprint\n",
    "\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "channels = [\"nightly\", \"aurora\", \"beta\", \"release\"]\n",
    "build_info = {\n",
    "    \"nightly\": {\n",
    "        \"fraction\": 0.1,\n",
    "        \"build_ids\": lambda x: x >= \"20170422\"\n",
    "    },\n",
    "    \"aurora\": {\n",
    "        \"fraction\": 0.1,\n",
    "        \"build_ids\": lambda x: True\n",
    "    },\n",
    "    \"beta\": {\n",
    "        \"fraction\": 0.1,\n",
    "        \"build_ids\": lambda x: True\n",
    "    },\n",
    "    \"release\": {\n",
    "        \"fraction\": 0.003,\n",
    "        \"build_ids\": lambda x: x >= \"20161104\"\n",
    "    }\n",
    "}\n",
    "pings = {}\n",
    "\n",
    "for c in channels:\n",
    "    pings[c] = Dataset.from_source(\"telemetry\") \\\n",
    "                      .where(docType=\"main\") \\\n",
    "                      .where(appUpdateChannel=c) \\\n",
    "                      .where(submissionDate=lambda x: \"20170422\" <= x <= \"20170425\") \\\n",
    "                      .where(appBuildId=build_info[c].get(\"build_ids\")) \\\n",
    "                      .where(sourceVersion=\"4\") \\\n",
    "                      .records(sc, sample=build_info[c].get(\"fraction\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_keyed_histograms(p):\n",
    "    if not isinstance(p, dict) or \\\n",
    "       \"payload\" not in p or \\\n",
    "       not isinstance(p[\"payload\"], dict) or \\\n",
    "       \"keyedHistograms\" not in p[\"payload\"] or \\\n",
    "       not isinstance(p[\"payload\"][\"keyedHistograms\"], dict):\n",
    "        return {}\n",
    "    return p.get(\"payload\", {}).get(\"keyedHistograms\", {})\n",
    "\n",
    "# This extracts the keyed histograms names which have an empty key string.\n",
    "def extract_affected_histograms(p):\n",
    "    khs = get_keyed_histograms(p)\n",
    "    names = [name for name,kh in khs.iteritems() if \"\" in kh]\n",
    "    return names\n",
    "\n",
    "extracts = {}\n",
    "\n",
    "for c,ps in pings.iteritems():\n",
    "    extracts[c] = ps.flatMap(extract_affected_histograms)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's get sorted lists of the hit counts per channel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nameCounts = {}\n",
    "for channel,names in extracts.iteritems():\n",
    "    counts = names.countByValue()\n",
    "    nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "nightly\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th># of hits in nightly</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
       "      <td>14375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME</th>\n",
       "      <td>14362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
       "      <td>14322</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           # of hits in nightly\n",
       "URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR                 14375\n",
       "URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME                 14362\n",
       "URLCLASSIFIER_UPDATE_REMOTE_STATUS2                       14322"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "aurora\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th># of hits in aurora</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
       "      <td>5220</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
       "      <td>58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           # of hits in aurora\n",
       "DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED                       5220\n",
       "URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR                   58\n",
       "CANVAS_WEBGL_ACCL_FAILURE_ID                                 7\n",
       "URLCLASSIFIER_UPDATE_REMOTE_STATUS2                          2\n",
       "CANVAS_WEBGL_FAILURE_ID                                      1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "beta\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th># of hits in beta</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
       "      <td>5852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
       "      <td>4282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
       "      <td>1255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FX_MIGRATION_ERRORS</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           # of hits in beta\n",
       "DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED                     5852\n",
       "CANVAS_WEBGL_ACCL_FAILURE_ID                            4282\n",
       "URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR               1255\n",
       "CANVAS_WEBGL_FAILURE_ID                                   35\n",
       "FX_MIGRATION_ERRORS                                        1\n",
       "URLCLASSIFIER_UPDATE_REMOTE_STATUS2                        1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "release\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th># of hits in release</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     # of hits in release\n",
       "DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED                   104\n",
       "CANVAS_WEBGL_FAILURE_ID                                 2"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for channel in channels:\n",
    "    df = pd.DataFrame([x for _,x in  nameCounts[channel]],\n",
    "                      [x for x,_ in  nameCounts[channel]])\n",
    "    print \"\\n\" + channel + \"\\n\"\n",
    "    df.columns = [\"# of hits in \" + channel]\n",
    "    IPython.display.display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## histogram-empty-key-counts.py

# coding: utf-8

# # Find histograms with empty keys

# ### Find histograms with empty keys

# In[4]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import IPython

from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client
from pprint import pprint

get_ipython().magic(u'pylab inline')


# In[31]:

# Channels to scan; the results are reported in this order.
channels = ["nightly", "aurora", "beta", "release"]

# Per-channel settings:
#   "fraction"  - value passed as `sample` when fetching the records.
#   "build_ids" - predicate used as the appBuildId filter (build ids are
#                 compared as strings).
build_info = {
    "nightly": {"build_ids": lambda x: x >= "20170422", "fraction": 0.1},
    "aurora": {"build_ids": lambda x: True, "fraction": 0.1},
    "beta": {"build_ids": lambda x: True, "fraction": 0.1},
    "release": {"build_ids": lambda x: x >= "20161104", "fraction": 0.003},
}

# Channel name -> sampled "main" pings submitted between 20170422 and 20170425.
# NOTE(review): `sc` is not defined in this file; it is expected to come from
# the notebook environment (presumably the SparkContext - confirm).
pings = {}
for c in channels:
    pings[c] = (
        Dataset.from_source("telemetry")
        .where(docType="main")
        .where(appUpdateChannel=c)
        .where(submissionDate=lambda x: "20170422" <= x <= "20170425")
        .where(appBuildId=build_info[c]["build_ids"])
        .where(sourceVersion="4")
        .records(sc, sample=build_info[c]["fraction"])
    )


# Next, extract the names of all keyed histograms that contain an empty-string key (considering only valid-looking pings).

# In[33]:

def get_keyed_histograms(p):
    """Return the ``payload.keyedHistograms`` dict of ping ``p``.

    An empty dict is returned when ``p`` is not a dict, when it has no dict
    under "payload", or when the payload has no dict under "keyedHistograms".
    """
    if not isinstance(p, dict):
        return {}

    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed = payload.get("keyedHistograms")
    if isinstance(keyed, dict):
        return keyed
    return {}

# This extracts the keyed histograms names which have an empty key string.
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping ``p`` that have an
    empty-string key.

    ``p`` is passed through ``get_keyed_histograms``, so malformed pings
    yield an empty list. Entries whose value is not a dict are skipped.
    """
    khs = get_keyed_histograms(p)
    # Only dict-shaped histograms can carry keys. Without the isinstance
    # check a string value would always match (`"" in "abc"` is True) and a
    # None/number value would raise TypeError.
    return [name for name, kh in khs.items()
            if isinstance(kh, dict) and "" in kh]

# Channel name -> collection of affected histogram names: one entry per
# (ping, histogram) pair in which the histogram has an empty-string key.
extracts = {}

for c in pings:
    ps = pings[c]
    extracts[c] = ps.flatMap(extract_affected_histograms)


# Let's get sorted lists of the hit counts per channel.

# In[34]:

# Channel name -> list of (histogram name, hit count) pairs, ordered from the
# most hits to the fewest.
nameCounts = {}
for channel in extracts:
    names = extracts[channel]
    counts = names.countByValue()
    nameCounts[channel] = sorted(counts.items(),
                                 key=lambda pair: pair[1],
                                 reverse=True)


# In[35]:

# Show one table per channel: histogram names as the index and the number of
# hits as the single column.
for channel in channels:
    # Each entry of nameCounts[channel] is a (histogram name, hit count) pair.
    hist_names = [name for name, _ in nameCounts[channel]]
    hit_counts = [count for _, count in nameCounts[channel]]

    # Name the column in the constructor. Assigning `df.columns` afterwards
    # raises ValueError (length mismatch) for a channel without any hit,
    # because the empty frame has no column to rename.
    df = pd.DataFrame(hit_counts,
                      index=hist_names,
                      columns=["# of hits in " + channel])

    # Parenthesised single argument: prints the same text on Python 2 and 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)


# In[ ]:
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Find histograms with empty keys"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Find histograms with empty keys"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Populating the interactive namespace from numpy and matplotlib\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/mnt/anaconda2/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:\n",
	"\n",
	"pylab import has clobbered these variables: ['Figure', 'Annotation']\n",
	"`%matplotlib` prevents importing * from pylab and numpy\n",
	"\n"
	]
	}
	],
	"source": [
	"import ujson as json\n",
	"import matplotlib.pyplot as plt\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import plotly.plotly as py\n",
	"from plotly.graph_objs import *\n",
	"import IPython\n",
	"\n",
	"from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client\n",
	"from pprint import pprint\n",
	"\n",
	"%pylab inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"channels = [\"nightly\", \"aurora\", \"beta\", \"release\"]\n",
	"build_info = {\n",
	" \"nightly\": {\n",
	" \"fraction\": 0.1,\n",
	" \"build_ids\": lambda x: x >= \"20170422\"\n",
	" },\n",
	" \"aurora\": {\n",
	" \"fraction\": 0.1,\n",
	" \"build_ids\": lambda x: True\n",
	" },\n",
	" \"beta\": {\n",
	" \"fraction\": 0.1,\n",
	" \"build_ids\": lambda x: True\n",
	" },\n",
	" \"release\": {\n",
	" \"fraction\": 0.003,\n",
	" \"build_ids\": lambda x: x >= \"20161104\"\n",
	" }\n",
	"}\n",
	"pings = {}\n",
	"\n",
	"for c in channels:\n",
	" pings[c] = Dataset.from_source(\"telemetry\") \\\n",
	" .where(docType=\"main\") \\\n",
	" .where(appUpdateChannel=c) \\\n",
	" .where(submissionDate=lambda x: \"20170422\" <= x <= \"20170425\") \\\n",
	" .where(appBuildId=build_info[c].get(\"build_ids\")) \\\n",
	" .where(sourceVersion=\"4\") \\\n",
	" .records(sc, sample=build_info[c].get(\"fraction\"))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def get_keyed_histograms(p):\n",
	" if not isinstance(p, dict) or \\\n",
	" \"payload\" not in p or \\\n",
	" not isinstance(p[\"payload\"], dict) or \\\n",
	" \"keyedHistograms\" not in p[\"payload\"] or \\\n",
	" not isinstance(p[\"payload\"][\"keyedHistograms\"], dict):\n",
	" return {}\n",
	" return p.get(\"payload\", {}).get(\"keyedHistograms\", {})\n",
	"\n",
	"# This extracts the keyed histograms names which have an empty key string.\n",
	"def extract_affected_histograms(p):\n",
	" khs = get_keyed_histograms(p)\n",
	" names = [name for name,kh in khs.iteritems() if \"\" in kh]\n",
	" return names\n",
	"\n",
	"extracts = {}\n",
	"\n",
	"for c,ps in pings.iteritems():\n",
	" extracts[c] = ps.flatMap(extract_affected_histograms)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's get sorted lists of the hit counts per channel."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"nameCounts = {}\n",
	"for channel,names in extracts.iteritems():\n",
	" counts = names.countByValue()\n",
	" nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"nightly\n",
	"\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th># of hits in nightly</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
	" <td>14375</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME</th>\n",
	" <td>14362</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
	" <td>14322</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" # of hits in nightly\n",
	"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 14375\n",
	"URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME 14362\n",
	"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 14322"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"aurora\n",
	"\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th># of hits in aurora</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
	" <td>5220</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
	" <td>58</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
	" <td>7</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" # of hits in aurora\n",
	"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 5220\n",
	"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 58\n",
	"CANVAS_WEBGL_ACCL_FAILURE_ID 7\n",
	"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 2\n",
	"CANVAS_WEBGL_FAILURE_ID 1"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"beta\n",
	"\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th># of hits in beta</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
	" <td>5852</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
	" <td>4282</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
	" <td>1255</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
	" <td>35</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>FX_MIGRATION_ERRORS</th>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" # of hits in beta\n",
	"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 5852\n",
	"CANVAS_WEBGL_ACCL_FAILURE_ID 4282\n",
	"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 1255\n",
	"CANVAS_WEBGL_FAILURE_ID 35\n",
	"FX_MIGRATION_ERRORS 1\n",
	"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 1"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"release\n",
	"\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th># of hits in release</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
	" <td>104</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" # of hits in release\n",
	"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 104\n",
	"CANVAS_WEBGL_FAILURE_ID 2"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"for channel in channels:\n",
	" df = pd.DataFrame([x for _,x in nameCounts[channel]],\n",
	" [x for x,_ in nameCounts[channel]])\n",
	" print \"\\n\" + channel + \"\\n\"\n",
	" df.columns = [\"# of hits in \" + channel]\n",
	" IPython.display.display(df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}

	# coding: utf-8

	# # Find histograms with empty keys

	# ### Find histograms with empty keys

	# In[4]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	from plotly.graph_objs import *
	import IPython

	from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client
	from pprint import pprint

	get_ipython().magic(u'pylab inline')


	# In[31]:

	# Channels to scan; the results are reported in this order.
	channels = ["nightly", "aurora", "beta", "release"]

	# Per-channel settings:
	#   "fraction"  - value passed as `sample` when fetching the records.
	#   "build_ids" - predicate used as the appBuildId filter (build ids are
	#                 compared as strings).
	build_info = {
	    "nightly": {
	        "fraction": 0.1,
	        "build_ids": lambda x: x >= "20170422"
	    },
	    "aurora": {
	        "fraction": 0.1,
	        "build_ids": lambda x: True
	    },
	    "beta": {
	        "fraction": 0.1,
	        "build_ids": lambda x: True
	    },
	    "release": {
	        "fraction": 0.003,
	        "build_ids": lambda x: x >= "20161104"
	    }
	}

	# Channel name -> sampled "main" pings submitted between 20170422 and 20170425.
	# NOTE(review): `sc` is not defined in this file; it is expected to come from
	# the notebook environment (presumably the SparkContext - confirm).
	pings = {}

	for c in channels:
	    # The loop body and the call chain had lost their indentation and line
	    # continuations; the chain is parenthesised so it parses again.
	    pings[c] = (
	        Dataset.from_source("telemetry")
	        .where(docType="main")
	        .where(appUpdateChannel=c)
	        .where(submissionDate=lambda x: "20170422" <= x <= "20170425")
	        .where(appBuildId=build_info[c].get("build_ids"))
	        .where(sourceVersion="4")
	        .records(sc, sample=build_info[c].get("fraction"))
	    )


	# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).

	# In[33]:

	def get_keyed_histograms(p):
	    """Return the ``payload.keyedHistograms`` dict of ping ``p``.

	    An empty dict is returned when ``p`` is not a dict, when it has no dict
	    under "payload", or when the payload has no dict under "keyedHistograms".
	    """
	    if not isinstance(p, dict):
	        return {}

	    payload = p.get("payload")
	    if not isinstance(payload, dict):
	        return {}

	    keyed = payload.get("keyedHistograms")
	    if isinstance(keyed, dict):
	        return keyed
	    return {}

	# This extracts the keyed histograms names which have an empty key string.
	def extract_affected_histograms(p):
	    """Return the names of the keyed histograms in ping ``p`` that have an
	    empty-string key.

	    ``p`` is passed through ``get_keyed_histograms``, so malformed pings
	    yield an empty list. Entries whose value is not a dict are skipped.
	    """
	    khs = get_keyed_histograms(p)
	    # Only dict-shaped histograms can carry keys. Without the isinstance
	    # check a string value would always match (`"" in "abc"` is True) and a
	    # None/number value would raise TypeError.
	    return [name for name, kh in khs.items()
	            if isinstance(kh, dict) and "" in kh]

	# Channel name -> collection of affected histogram names: one entry per
	# (ping, histogram) pair in which the histogram has an empty-string key.
	extracts = {}

	for c, ps in pings.iteritems():
	    # Loop body re-indented; it had been flattened to the level of the `for`.
	    extracts[c] = ps.flatMap(extract_affected_histograms)


	# Let's get sorted lists of the hit counts per channel.

	# In[34]:

	# Channel name -> list of (histogram name, hit count) pairs, ordered from the
	# most hits to the fewest.
	nameCounts = {}
	for channel, names in extracts.iteritems():
	    # Loop body re-indented; it had been flattened to the level of the `for`.
	    counts = names.countByValue()
	    nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)


	# In[35]:

	# Show one table per channel: histogram names as the index and the number of
	# hits as the single column.
	for channel in channels:
	    # Each entry of nameCounts[channel] is a (histogram name, hit count) pair.
	    hist_names = [name for name, _ in nameCounts[channel]]
	    hit_counts = [count for _, count in nameCounts[channel]]

	    # Name the column in the constructor. Assigning `df.columns` afterwards
	    # raises ValueError (length mismatch) for a channel without any hit,
	    # because the empty frame has no column to rename.
	    df = pd.DataFrame(hit_counts,
	                      index=hist_names,
	                      columns=["# of hits in " + channel])

	    # Parenthesised single argument: prints the same text on Python 2 and 3.
	    print("\n" + channel + "\n")
	    IPython.display.display(df)


	# In[ ]: