Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active February 24, 2017 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Find histograms with empty keys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load a sample of main pings per channel"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:\n",
"\n",
"pylab import has clobbered these variables: ['Figure', 'Annotation']\n",
"`%matplotlib` prevents importing * from pylab and numpy\n",
"\n"
]
}
],
"source": [
"import ujson as json\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"from plotly.graph_objs import *\n",
"import IPython\n",
"\n",
"from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client\n",
"from pprint import pprint\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"channels = [\"nightly\", \"aurora\", \"beta\", \"release\"]\n",
"submission_dates = (\"20170122\", \"20170222\")\n",
"fractions = {\n",
" \"nightly\": 0.1,\n",
" \"aurora\": 0.1,\n",
" \"beta\": 0.1,\n",
" \"release\": 0.003\n",
"}\n",
"pings = {}\n",
"\n",
"for c in channels:\n",
" pings[c] = get_pings(sc,\n",
" app=\"Firefox\",\n",
" channel=c,\n",
" doc_type=\"main\",\n",
" fraction=fractions[c],\n",
" submission_date=submission_dates)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now extract the names of all keyed histograms that contain an empty key string (from all valid-looking pings)."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_keyed_histograms(p):\n",
" if not isinstance(p, dict) or \\\n",
" \"payload\" not in p or \\\n",
" not isinstance(p[\"payload\"], dict) or \\\n",
" \"keyedHistograms\" not in p[\"payload\"] or \\\n",
" not isinstance(p[\"payload\"][\"keyedHistograms\"], dict):\n",
" return {}\n",
" return p.get(\"payload\", {}).get(\"keyedHistograms\", {})\n",
"\n",
"# This extracts the keyed histograms names which have an empty key string.\n",
"def extract_affected_histograms(p):\n",
" khs = get_keyed_histograms(p)\n",
" names = [name for name,kh in khs.iteritems() if \"\" in kh]\n",
" return names\n",
"\n",
"extracts = {}\n",
"\n",
"for c,ps in pings.iteritems():\n",
" extracts[c] = ps.flatMap(extract_affected_histograms)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get sorted lists of the hit counts per channel."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"nameCounts = {}\n",
"for channel,names in extracts.iteritems():\n",
" counts = names.countByValue()\n",
" nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"nightly\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in nightly</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>794</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
" <td>104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in nightly\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 794\n",
"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 104\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 76\n",
"CANVAS_WEBGL_FAILURE_ID 16"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"aurora\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in aurora</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>42422</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in aurora\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 42422\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 75\n",
"CANVAS_WEBGL_FAILURE_ID 6"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"beta\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in beta</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>75355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>53192</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FX_MIGRATION_ERRORS</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in beta\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 75355\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 53192\n",
"CANVAS_WEBGL_FAILURE_ID 596\n",
"FX_MIGRATION_ERRORS 2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"release\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in release</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in release\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 879\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 396\n",
"CANVAS_WEBGL_FAILURE_ID 7"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for channel in channels:\n",
" df = pd.DataFrame([x for _,x in nameCounts[channel]],\n",
" [x for x,_ in nameCounts[channel]])\n",
" print \"\\n\" + channel + \"\\n\"\n",
" df.columns = [\"# of hits in \" + channel]\n",
" IPython.display.display(df)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# # Find histograms with empty keys
# ### Load a sample of main pings per channel
# In[7]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import IPython
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from pprint import pprint
get_ipython().magic(u'pylab inline')
# In[8]:
# Release channels to analyse; this list also fixes the order of the report.
channels = ["nightly", "aurora", "beta", "release"]

# Two date strings handed to get_pings as `submission_date`
# (presumably an inclusive YYYYMMDD start/end range -- confirm against moztelemetry).
submission_dates = ("20170122", "20170222")

# Value passed as `fraction` for each channel (presumably the sampling
# fraction; release uses a much smaller value than the other channels).
fractions = dict(
    nightly=0.1,
    aurora=0.1,
    beta=0.1,
    release=0.003,
)

# Channel name -> result of get_pings() for that channel.
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext injected by the notebook environment -- confirm.
pings = {}
for c in channels:
    pings[c] = get_pings(
        sc,
        app="Firefox",
        channel=c,
        doc_type="main",
        fraction=fractions[c],
        submission_date=submission_dates,
    )
# Now extract the names of all keyed histograms that contain an empty key string (from all valid-looking pings).
# In[9]:
def get_keyed_histograms(p):
    """Return the ``payload.keyedHistograms`` dict of ping ``p``.

    An empty dict is returned whenever the ping does not have the expected
    shape: ``p`` is not a dict, ``p["payload"]`` is missing or not a dict, or
    ``p["payload"]["keyedHistograms"]`` is missing or not a dict.
    Otherwise the stored dict itself (not a copy) is returned.
    """
    if not isinstance(p, dict):
        return {}

    # A missing key yields None, which fails the dict check just like any
    # other malformed value would.
    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed = payload.get("keyedHistograms")
    if not isinstance(keyed, dict):
        return {}

    return keyed
# This extracts the names of the keyed histograms which have an empty key string.
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping ``p`` that have an ``""`` key.

    The keyed histograms are obtained through ``get_keyed_histograms``, so a
    ping without a usable ``payload.keyedHistograms`` dict yields ``[]``.

    Only entries whose value is itself a dict are inspected:
      * for a string value, ``"" in value`` is a substring test and is always
        True, which would report a histogram that has no empty key at all;
      * for ``None`` or a number, ``"" in value`` raises ``TypeError``.
    Such malformed entries are skipped instead.
    """
    khs = get_keyed_histograms(p)
    # .items() behaves the same on Python 2.7 and Python 3 (iteritems() exists
    # only on Python 2).
    return [
        name
        for name, kh in khs.items()
        if isinstance(kh, dict) and "" in kh
    ]
# Channel name -> result of flat-mapping extract_affected_histograms over that
# channel's pings (one histogram name per affected histogram per ping).
extracts = {}

# .items() instead of the Python-2-only .iteritems(): identical result on 2.7,
# and it also runs on Python 3.
for c, ps in pings.items():
    extracts[c] = ps.flatMap(extract_affected_histograms)
# Let's get sorted lists of the hit counts per channel.
# In[10]:
# Channel name -> list of (histogram name, hit count) pairs, highest count first.
nameCounts = {}

# .items() replaces the Python-2-only .iteritems() in both places below; the
# results are unchanged on 2.7 and the cell also runs on Python 3.
for channel, names in extracts.items():
    # countByValue() gives a mapping of histogram name -> number of occurrences.
    counts = names.countByValue()
    nameCounts[channel] = sorted(counts.items(), key=lambda t: t[1], reverse=True)
# In[12]:
# Show one table per channel: histogram names as the index, hit counts as the
# single column, in the order stored in nameCounts (highest count first).
for channel in channels:
    hits = nameCounts[channel]
    column = "# of hits in " + channel

    # Name the column at construction time. Building an unnamed frame and then
    # assigning `df.columns` fails with a length-mismatch ValueError when a
    # channel has no hits, because the empty frame has zero columns.
    df = pd.DataFrame(
        {column: [count for _, count in hits]},
        index=[name for name, _ in hits],
    )

    # Single parenthesised argument: prints the same text on Python 2.7 and 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment