Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active February 24, 2017 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# # Find histograms with empty keys
# ### Find histograms with empty keys
# In[7]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import IPython
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from pprint import pprint
get_ipython().magic(u'pylab inline')
# In[8]:
# Firefox channels to query; this list also drives the per-channel loop below.
channels = ["nightly", "aurora", "beta", "release"]

# Pair of date strings (they look like YYYYMMDD) handed to get_pings as
# `submission_date`.
submission_dates = ("20170122", "20170222")

# Value passed as get_pings' `fraction` argument for each channel; release
# uses a much smaller value than the three pre-release channels.
fractions = {
    "nightly": 0.1,
    "aurora": 0.1,
    "beta": 0.1,
    "release": 0.003,
}

# Channel name -> whatever get_pings returns for that channel.
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext injected by the notebook environment -- confirm.
pings = {}
for c in channels:
    pings[c] = get_pings(
        sc,
        app="Firefox",
        doc_type="main",
        channel=c,
        submission_date=submission_dates,
        fraction=fractions[c],
    )
# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).
# In[9]:
def get_keyed_histograms(p):
    """Return the ``payload -> keyedHistograms`` dict of ping *p*, or ``{}``.

    An empty dict is returned whenever the ping does not look valid: *p* is
    not a dict, ``"payload"`` is missing or not a dict, or
    ``"keyedHistograms"`` is missing or not a dict.  For a valid ping the
    stored dict itself (not a copy) is returned.
    """
    if not isinstance(p, dict):
        return {}

    # A missing key yields None, which fails the isinstance check just like
    # any other non-dict value, so presence and type are tested in one step.
    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed = payload.get("keyedHistograms")
    return keyed if isinstance(keyed, dict) else {}
# This extracts the names of the keyed histograms that contain an empty key string.
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping *p* with an empty key.

    A keyed histogram counts as affected when its per-key mapping contains
    the empty string ``""`` as a key.  Pings that do not look valid yield an
    empty list (see ``get_keyed_histograms``).
    """
    khs = get_keyed_histograms(p)
    # Only dict-valued entries are inspected.  A malformed ping may carry
    # something else under a histogram name: `"" in kh` would raise TypeError
    # for None or a number, and would be a false positive for any string
    # (the empty string is a substring of every string).
    return [
        name
        for name, kh in khs.items()
        if isinstance(kh, dict) and "" in kh
    ]
# Channel name -> flattened collection of affected histogram names, with one
# entry for every (ping, histogram) hit in that channel.
# `.items()` is used instead of `.iteritems()` so the loop runs unchanged on
# both Python 2 and Python 3.
extracts = {}
for c, ps in pings.items():
    # NOTE(review): `ps` is whatever get_pings returned -- presumably a Spark
    # RDD, given the flatMap call; confirm.
    extracts[c] = ps.flatMap(extract_affected_histograms)
# Let's get sorted lists of the hit counts per channel.
# In[10]:
# Channel name -> list of (histogram name, hit count) pairs, highest count
# first.  `.items()` replaces the Python-2-only `.iteritems()`; the pairs
# produced are the same.
nameCounts = {}
for channel, names in extracts.items():
    # countByValue() is expected to return a mapping of name -> occurrences.
    counts = names.countByValue()
    nameCounts[channel] = sorted(
        counts.items(),
        key=lambda t: t[1],
        reverse=True,
    )
# In[12]:
# Show one table per channel: histogram names as the index, hit counts as the
# single column.
for channel in channels:
    hits = nameCounts[channel]
    label = "# of hits in " + channel
    # The column is named at construction time.  Building an unnamed frame
    # and assigning `df.columns` afterwards raises ValueError (length
    # mismatch) for a channel without any hits, because the empty frame has
    # zero columns.
    df = pd.DataFrame(
        {label: [count for _, count in hits]},
        index=[name for name, _ in hits],
    )
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("\n" + channel + "\n")
    IPython.display.display(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment