Skip to content

Instantly share code, notes, and snippets.

@Dexterp37
Created May 2, 2017 09:06
Show Gist options
  • Save Dexterp37/c0dd82374b49cf17539ded0e680af585 to your computer and use it in GitHub Desktop.
Save Dexterp37/c0dd82374b49cf17539ded0e680af585 to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Find histograms with empty keys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find histograms with empty keys"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/anaconda2/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:\n",
"\n",
"pylab import has clobbered these variables: ['Figure', 'Annotation']\n",
"`%matplotlib` prevents importing * from pylab and numpy\n",
"\n"
]
}
],
"source": [
"import ujson as json\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"from plotly.graph_objs import *\n",
"import IPython\n",
"\n",
"from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client\n",
"from pprint import pprint\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"channels = [\"nightly\", \"aurora\", \"beta\", \"release\"]\n",
"build_info = {\n",
" \"nightly\": {\n",
" \"fraction\": 0.1,\n",
" \"build_ids\": lambda x: x >= \"20170422\"\n",
" },\n",
" \"aurora\": {\n",
" \"fraction\": 0.1,\n",
" \"build_ids\": lambda x: True\n",
" },\n",
" \"beta\": {\n",
" \"fraction\": 0.1,\n",
" \"build_ids\": lambda x: True\n",
" },\n",
" \"release\": {\n",
" \"fraction\": 0.003,\n",
" \"build_ids\": lambda x: x >= \"20161104\"\n",
" }\n",
"}\n",
"pings = {}\n",
"\n",
"for c in channels:\n",
" pings[c] = Dataset.from_source(\"telemetry\") \\\n",
" .where(docType=\"main\") \\\n",
" .where(appUpdateChannel=c) \\\n",
" .where(submissionDate=lambda x: \"20170422\" <= x <= \"20170425\") \\\n",
" .where(appBuildId=build_info[c].get(\"build_ids\")) \\\n",
" .where(sourceVersion=\"4\") \\\n",
" .records(sc, sample=build_info[c].get(\"fraction\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings)."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_keyed_histograms(p):\n",
" if not isinstance(p, dict) or \\\n",
" \"payload\" not in p or \\\n",
" not isinstance(p[\"payload\"], dict) or \\\n",
" \"keyedHistograms\" not in p[\"payload\"] or \\\n",
" not isinstance(p[\"payload\"][\"keyedHistograms\"], dict):\n",
" return {}\n",
" return p.get(\"payload\", {}).get(\"keyedHistograms\", {})\n",
"\n",
"# This extracts the keyed histograms names which have an empty key string.\n",
"def extract_affected_histograms(p):\n",
" khs = get_keyed_histograms(p)\n",
" names = [name for name,kh in khs.iteritems() if \"\" in kh]\n",
" return names\n",
"\n",
"extracts = {}\n",
"\n",
"for c,ps in pings.iteritems():\n",
" extracts[c] = ps.flatMap(extract_affected_histograms)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get sorted lists of the hit counts per channel."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"nameCounts = {}\n",
"for channel,names in extracts.iteritems():\n",
" counts = names.countByValue()\n",
" nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"nightly\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in nightly</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
" <td>14375</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME</th>\n",
" <td>14362</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
" <td>14322</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in nightly\n",
"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 14375\n",
"URLCLASSIFIER_UPDATE_SERVER_RESPONSE_TIME 14362\n",
"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 14322"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"aurora\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in aurora</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>5220</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
" <td>58</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in aurora\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 5220\n",
"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 58\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 7\n",
"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 2\n",
"CANVAS_WEBGL_FAILURE_ID 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"beta\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in beta</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>5852</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n",
" <td>4282</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR</th>\n",
" <td>1255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FX_MIGRATION_ERRORS</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in beta\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 5852\n",
"CANVAS_WEBGL_ACCL_FAILURE_ID 4282\n",
"URLCLASSIFIER_UPDATE_REMOTE_NETWORK_ERROR 1255\n",
"CANVAS_WEBGL_FAILURE_ID 35\n",
"FX_MIGRATION_ERRORS 1\n",
"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"release\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th># of hits in release</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n",
" <td>104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" # of hits in release\n",
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 104\n",
"CANVAS_WEBGL_FAILURE_ID 2"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for channel in channels:\n",
" df = pd.DataFrame([x for _,x in nameCounts[channel]],\n",
" [x for x,_ in nameCounts[channel]])\n",
" print \"\\n\" + channel + \"\\n\"\n",
" df.columns = [\"# of hits in \" + channel]\n",
" IPython.display.display(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# # Find histograms with empty keys
# ### Set up the environment and select the pings to analyze
# In[4]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import IPython
from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client
from pprint import pprint
get_ipython().magic(u'pylab inline')
# In[31]:
# Update channels to inspect; the per-channel results are built and displayed
# by iterating over this list.
channels = ["nightly", "aurora", "beta", "release"]

# Per-channel query settings:
#   "fraction"  - value passed as `sample=` when fetching the records. Release
#                 uses a much smaller fraction than the other channels, so the
#                 absolute hit counts are not directly comparable across
#                 channels.
#   "build_ids" - predicate passed as the `appBuildId` filter. The comparisons
#                 against "YYYYMMDD" literals are lexicographic string
#                 comparisons (assumes build ids are timestamp-like strings;
#                 TODO confirm).
build_info = {
    "nightly": {
        "fraction": 0.1,
        "build_ids": lambda x: x >= "20170422"
    },
    "aurora": {
        "fraction": 0.1,
        "build_ids": lambda x: True
    },
    "beta": {
        "fraction": 0.1,
        "build_ids": lambda x: True
    },
    "release": {
        "fraction": 0.003,
        "build_ids": lambda x: x >= "20161104"
    }
}

# Maps channel name -> sampled "main" ping records for that channel.
pings = {}

# NOTE(review): `sc` is not defined anywhere in this file; presumably it is the
# SparkContext injected by the notebook environment - confirm.
for channel in channels:
    # Index instead of .get(): a missing/mistyped key should fail loudly here
    # rather than silently pass None into the query.
    channel_info = build_info[channel]
    pings[channel] = (
        Dataset.from_source("telemetry")
        .where(docType="main")
        .where(appUpdateChannel=channel)
        .where(submissionDate=lambda x: "20170422" <= x <= "20170425")
        .where(appBuildId=channel_info["build_ids"])
        .where(sourceVersion="4")
        .records(sc, sample=channel_info["fraction"])
    )
# Next, extract the names of all keyed histograms that contain an empty key string (from all valid-looking pings).
# In[33]:
def get_keyed_histograms(p):
    """Return the keyed-histograms section of a ping, or ``{}`` if unusable.

    Args:
        p: A ping record. Expected to be a dict, but any value is accepted.

    Returns:
        ``p["payload"]["keyedHistograms"]`` when ``p`` is a dict, its
        ``"payload"`` entry is a dict, and that payload's
        ``"keyedHistograms"`` entry is a dict. In every other case a new
        empty dict is returned, so callers can iterate over the result
        without validating the ping themselves.
    """
    if not isinstance(p, dict):
        return {}

    # .get() yields None for a missing key, which fails the isinstance check
    # just like a present-but-malformed value does.
    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed_histograms = payload.get("keyedHistograms")
    if not isinstance(keyed_histograms, dict):
        return {}

    return keyed_histograms
def extract_affected_histograms(p):
    """Extract the names of the keyed histograms that have an empty key string.

    Args:
        p: A ping record; it is handed to ``get_keyed_histograms`` to obtain
            the mapping of histogram name -> keyed histogram.

    Returns:
        A list with the name of every keyed histogram whose value is a dict
        containing ``""`` as a key. The list is empty when nothing matches.
    """
    khs = get_keyed_histograms(p)
    # Only dict-valued entries are inspected: for a string value `"" in value`
    # is always True (the empty string is a substring of every string), which
    # would report a false hit, and for values such as None or a number the
    # membership test raises TypeError.
    return [name for name, kh in khs.items()
            if isinstance(kh, dict) and "" in kh]
# Maps channel name -> the result of flat-mapping `extract_affected_histograms`
# over that channel's pings (presumably one element per affected histogram per
# ping, assuming Spark RDD `flatMap` semantics - confirm).
extracts = {}

# .items() instead of .iteritems(): same iteration, and not Python-2-only.
for channel, channel_pings in pings.items():
    extracts[channel] = channel_pings.flatMap(extract_affected_histograms)
# Let's get sorted lists of the hit counts per channel.
# In[34]:
# Maps channel name -> list of (histogram name, hit count) tuples, ordered by
# descending hit count.
nameCounts = {}

for channel, names in extracts.items():
    # countByValue() returns a mapping of value -> number of occurrences.
    counts = names.countByValue()
    # Sort by descending count; ties are broken by histogram name so the
    # resulting order does not depend on dict iteration order.
    nameCounts[channel] = sorted(counts.items(), key=lambda t: (-t[1], t[0]))
# In[35]:
# Show one table per channel: histogram names as the index, hit counts as the
# single column.
for channel in channels:
    hits = nameCounts[channel]
    # Name the column in the constructor rather than assigning `df.columns`
    # afterwards: for a channel without any hit the frame built from empty
    # lists has zero columns, and assigning a one-element column list to it
    # raises ValueError.
    df = pd.DataFrame([count for _, count in hits],
                      index=[name for name, _ in hits],
                      columns=["# of hits in " + channel])
    # Parenthesized single argument prints identically on Python 2 and 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment