-
-
Save Dexterp37/c0dd82374b49cf17539ded0e680af585 to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # Find histograms with empty keys | |
# ### Find histograms with empty keys | |
# In[4]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
from plotly.graph_objs import * | |
import IPython | |
from moztelemetry import Dataset, get_pings_properties, get_one_ping_per_client | |
from pprint import pprint | |
get_ipython().magic(u'pylab inline') | |
# In[31]: | |
# Update channels to analyze.
channels = ["nightly", "aurora", "beta", "release"]

# Per-channel query settings:
#   "fraction"  - value passed as `sample=` when fetching the records
#   "build_ids" - predicate used as the appBuildId filter (string comparison)
build_info = {
    "nightly": {
        "fraction": 0.1,
        "build_ids": lambda x: x >= "20170422",
    },
    "aurora": {
        "fraction": 0.1,
        "build_ids": lambda x: True,
    },
    "beta": {
        "fraction": 0.1,
        "build_ids": lambda x: True,
    },
    "release": {
        "fraction": 0.003,
        "build_ids": lambda x: x >= "20161104",
    },
}

# Fetch, for every channel, a sample of "main" pings with submission dates in
# the range "20170422".."20170425" (both ends included), restricted to the
# channel's build-id predicate and to sourceVersion "4".
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext injected by the notebook environment -- confirm.
pings = {}
for c in channels:
    pings[c] = (
        Dataset.from_source("telemetry")
        .where(docType="main")
        .where(appUpdateChannel=c)
        .where(submissionDate=lambda x: "20170422" <= x <= "20170425")
        .where(appBuildId=build_info[c]["build_ids"])
        .where(sourceVersion="4")
        .records(sc, sample=build_info[c]["fraction"])
    )
# Next, extract the names of all keyed histograms that contain an empty key string (from all valid-looking pings).
# In[33]: | |
def get_keyed_histograms(p):
    """Return the "keyedHistograms" dict found in the payload of ping *p*.

    An empty dict is returned unless *p* is a dict whose "payload" entry is a
    dict that in turn stores a dict under "keyedHistograms".
    """
    if not isinstance(p, dict):
        return {}

    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed = payload.get("keyedHistograms")
    if not isinstance(keyed, dict):
        return {}

    return keyed
# This extracts the names of the keyed histograms that have an empty key string.
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping *p* with an empty key.

    The keyed histograms are obtained through get_keyed_histograms(), so a
    ping without a usable "keyedHistograms" section yields an empty list.
    Entries whose value is not a dict are skipped.
    """
    khs = get_keyed_histograms(p)
    # Only dict-valued entries can hold keys. Evaluating `"" in value` on
    # anything else would either raise (None, numbers) or, for a string value,
    # always be true and report a histogram that has no empty key at all.
    return [name for name in khs
            if isinstance(khs[name], dict) and "" in khs[name]]
# Per channel: the result of flatMap()-ing extract_affected_histograms over
# that channel's pings, i.e. the affected histogram names of every ping.
extracts = {
    channel: channel_pings.flatMap(extract_affected_histograms)
    for channel, channel_pings in pings.items()
}
# For each channel, build a list of (histogram name, hit count) pairs sorted by descending hit count.
# In[34]: | |
# For each channel, tally how often every affected histogram name occurs and
# keep the (name, count) pairs ordered from the highest count to the lowest.
nameCounts = {}
for channel in extracts:
    tally = extracts[channel].countByValue()
    nameCounts[channel] = sorted(
        tally.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
# In[35]: | |
# Show one table per channel: histogram names form the index and the hit
# counts form the single column.
for channel in channels:
    hits = nameCounts[channel]
    # The column label is passed to the constructor rather than assigned to
    # df.columns afterwards: for a channel without any hits the frame built
    # from empty lists has zero columns, and relabelling it with one name
    # raises a length-mismatch ValueError.
    df = pd.DataFrame(
        [count for _, count in hits],
        index=[name for name, _ in hits],
        columns=["# of hits in " + channel],
    )
    print("\n" + channel + "\n")
    IPython.display.display(df)
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment