georgf/histogram-empty-key-counts.ipynb

## histogram-empty-key-counts.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              histogram-empty-key-counts.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## histogram-empty-key-counts.py

# coding: utf-8

# # Find histograms with empty keys

# ### Find histograms with empty keys

# In[7]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
import IPython

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from pprint import pprint

get_ipython().magic(u'pylab inline')


# In[8]:

# Channels to query, and the pair of date strings handed to get_pings as
# `submission_date` (presumably an inclusive YYYYMMDD range -- confirm against
# the moztelemetry documentation).
channels = ["nightly", "aurora", "beta", "release"]
submission_dates = ("20170122", "20170222")
# Per-channel value passed as get_pings' `fraction` argument; "release" uses a
# much smaller value than the other channels.
fractions = {
    "nightly": 0.1,
    "aurora": 0.1,
    "beta": 0.1,
    "release": 0.003
}
# Maps channel name -> whatever get_pings returns for that channel.
pings = {}

# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext injected by the notebook environment -- confirm.
for c in channels:
    pings[c] = get_pings(sc,
                      app="Firefox",
                      channel=c,
                      doc_type="main",
                      fraction=fractions[c],
                      submission_date=submission_dates)


# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).

# In[9]:

def get_keyed_histograms(p):
    """Return the ``payload.keyedHistograms`` dict of ping *p*.

    An empty dict is returned when *p* is not a dict, or when its "payload"
    entry or the nested "keyedHistograms" entry is missing or not a dict.
    """
    if isinstance(p, dict):
        payload = p.get("payload")
        if isinstance(payload, dict):
            keyed = payload.get("keyedHistograms")
            if isinstance(keyed, dict):
                return keyed
    # Anything that does not look like a well-formed ping yields no histograms.
    return {}

# This extracts the names of the keyed histograms that have an empty key string.
def extract_affected_histograms(p):
    """List the keyed histograms in ping *p* that contain the key ""."""
    affected = []
    for hist_name, keyed_values in get_keyed_histograms(p).iteritems():
        if "" in keyed_values:
            affected.append(hist_name)
    return affected

# Maps channel name -> the result of flat-mapping extract_affected_histograms
# over that channel's pings (presumably a Spark RDD holding one histogram name
# per affected ping/histogram pair -- confirm).
extracts = {}

for c,ps in pings.iteritems():
    extracts[c] = ps.flatMap(extract_affected_histograms)


# Let's get sorted lists of the hit counts per channel.

# In[10]:

# Maps channel name -> list of (histogram name, hit count) pairs, ordered from
# the highest hit count to the lowest.
nameCounts = {}
for channel,names in extracts.iteritems():
    # countByValue() yields a mapping of histogram name -> number of occurrences.
    counts = names.countByValue()
    # Sort on the count (second tuple element), largest first.
    nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)


# In[12]:

# Show one table per channel: histogram names as the index and a single
# column holding the number of hits.
for channel in channels:
    # nameCounts[channel] is a list of (histogram name, hit count) pairs.
    hist_names = [name for name, _ in nameCounts[channel]]
    hit_counts = [count for _, count in nameCounts[channel]]
    # Give the column label to the constructor rather than assigning
    # df.columns afterwards: for a channel without any hits the frame would
    # otherwise have zero columns and the assignment would raise a
    # length-mismatch ValueError.
    df = pd.DataFrame(hit_counts,
                      index=hist_names,
                      columns=["# of hits in " + channel])
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)

	# coding: utf-8

	# # Find histograms with empty keys

	# ### Find histograms with empty keys

	# In[7]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	from plotly.graph_objs import *
	import IPython

	from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
	from pprint import pprint

	get_ipython().magic(u'pylab inline')


	# In[8]:

	# Channels to query, and the pair of date strings handed to get_pings as
	# `submission_date`.
	channels = ["nightly", "aurora", "beta", "release"]
	submission_dates = ("20170122", "20170222")
	# Per-channel value passed as get_pings' `fraction` argument.
	fractions = {
	    "nightly": 0.1,
	    "aurora": 0.1,
	    "beta": 0.1,
	    "release": 0.003
	}
	# Maps channel name -> whatever get_pings returns for that channel.
	pings = {}

	# The loop body must be indented under the `for`; the flattened layout
	# was not valid Python.
	# NOTE(review): `sc` is not defined in this file; presumably it is the
	# SparkContext injected by the notebook environment -- confirm.
	for c in channels:
	    pings[c] = get_pings(sc,
	                         app="Firefox",
	                         channel=c,
	                         doc_type="main",
	                         fraction=fractions[c],
	                         submission_date=submission_dates)


	# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).

	# In[9]:

	def get_keyed_histograms(p):
	    """Return the ``payload.keyedHistograms`` dict of ping *p*.

	    An empty dict is returned when *p* is not a dict, or when its "payload"
	    entry or the nested "keyedHistograms" entry is missing or not a dict.
	    """
	    # The body is indented under the `def`; with the body flush against the
	    # header the function could not be parsed.
	    if (not isinstance(p, dict)
	            or "payload" not in p
	            or not isinstance(p["payload"], dict)
	            or "keyedHistograms" not in p["payload"]
	            or not isinstance(p["payload"]["keyedHistograms"], dict)):
	        return {}
	    return p["payload"]["keyedHistograms"]

	# This extracts the names of the keyed histograms that have an empty key string.
	def extract_affected_histograms(p):
	    """List the keyed histograms in ping *p* that contain the key ""."""
	    # The body is indented under the `def`; with the body flush against the
	    # header the function could not be parsed.
	    khs = get_keyed_histograms(p)
	    names = [name for name, kh in khs.iteritems() if "" in kh]
	    return names

	# Maps channel name -> the result of flat-mapping extract_affected_histograms
	# over that channel's pings.
	extracts = {}

	# The loop body must be indented under the `for` to be valid Python.
	for c, ps in pings.iteritems():
	    extracts[c] = ps.flatMap(extract_affected_histograms)


	# Let's get sorted lists of the hit counts per channel.

	# In[10]:

	# Maps channel name -> list of (histogram name, hit count) pairs, ordered from
	# the highest hit count to the lowest.
	nameCounts = {}
	# The loop body must be indented under the `for` to be valid Python.
	for channel, names in extracts.iteritems():
	    # countByValue() yields a mapping of histogram name -> number of occurrences.
	    counts = names.countByValue()
	    # Sort on the count (second tuple element), largest first.
	    nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)


	# In[12]:

	# Show one table per channel: histogram names as the index and a single
	# column holding the number of hits.
	for channel in channels:
	    # nameCounts[channel] is a list of (histogram name, hit count) pairs.
	    hist_names = [name for name, _ in nameCounts[channel]]
	    hit_counts = [count for _, count in nameCounts[channel]]
	    # Give the column label to the constructor rather than assigning
	    # df.columns afterwards: for a channel without any hits the frame would
	    # otherwise have zero columns and the assignment would raise a
	    # length-mismatch ValueError.
	    df = pd.DataFrame(hit_counts,
	                      index=hist_names,
	                      columns=["# of hits in " + channel])
	    # Single-argument parenthesised form prints the same text on Python 2 and 3.
	    print("\n" + channel + "\n")
	    IPython.display.display(df)