Skip to content

Instantly share code, notes, and snippets.

@bill-mccloskey
Last active August 18, 2017 23:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bill-mccloskey/897f18b79adf252a820d875558d76d9e to your computer and use it in GitHub Desktop.
Save bill-mccloskey/897f18b79adf252a820d875558d76d9e to your computer and use it in GitHub Desktop.
runnable-analysis
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
from moztelemetry import get_pings_properties, get_one_ping_per_client
from moztelemetry.dataset import Dataset
get_ipython().magic(u'matplotlib inline')
# In[2]:
# Display the schema of the "telemetry" source as this cell's output.
Dataset.from_source("telemetry").schema
# In[3]:
# Select 'main' pings on the nightly channel whose build id starts with
# 20170816 or 20170817, without down-sampling (sample=1.0).
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# injected by the notebook environment -- confirm.
pings = (
    Dataset.from_source("telemetry")
    .where(docType='main')
    .where(appBuildId=lambda build_id: build_id.startswith(('20170816', '20170817')))
    .where(appUpdateChannel="nightly")
    .records(sc, sample=1.0)
)
# Keep only the one keyed histogram this analysis reads from each ping.
# In[4]:
subset = get_pings_properties(
    pings,
    ["payload/processes/content/keyedHistograms/MAIN_THREAD_RUNNABLE_MS"],
)
# In[5]:
def count(d):
    """Return ``(key, total)`` pairs for one ping's keyed histogram.

    ``d`` must contain the entry
    ``'payload/processes/content/keyedHistograms/MAIN_THREAD_RUNNABLE_MS'``.
    A falsy entry (e.g. ``None``) is treated as an empty mapping and yields
    ``[]``. Otherwise the entry maps each key to a histogram whose
    ``'values'`` item is itself a mapping; ``total`` is the sum of that
    mapping's values.
    """
    keyed = d['payload/processes/content/keyedHistograms/MAIN_THREAD_RUNNABLE_MS'] or {}
    return [
        (key, sum(buckets[index] for index in buckets))
        for key, buckets in ((k, keyed[k]['values']) for k in keyed)
    ]
# In[6]:
# Expand every ping into (key, total) pairs via count(), add up the totals
# per key, and collect the result into a local list.
freq = (
    subset
    .flatMap(count)
    .reduceByKey(lambda left, right: left + right)
    .collect()
)
# In[7]:
# Order in place by total, largest first.
freq.sort(key=lambda pair: pair[1], reverse=True)
# In[8]:
def is_labeled(name):
    """Return True if ``name`` counts as labeled for this analysis.

    A name is treated as labeled when it ends with ``'(labeled)'`` or
    contains ``'PVsync'`` or ``'Idle'`` anywhere in it.
    """
    return name.endswith('(labeled)') or 'PVsync' in name or 'Idle' in name
# In[9]:
# Drop every entry whose name starts with 'PJavaScript'; the order of
# `freq` is preserved.
freq_filt = [
    (name, v)
    for (name, v) in freq
    if not name.startswith('PJavaScript')
]
# In[14]:
# First 18 unlabeled entries, each with its position in freq_filt.
[
    (i, name, v)
    for (i, (name, v)) in enumerate(freq_filt)
    if not is_labeled(name)
][:18]
# In[17]:
# Every unlabeled entry.
[
    (name, v)
    for (name, v) in freq_filt
    if not is_labeled(name)
]
# In[12]:
# Walk freq_filt in order until the running total reaches 99% of the overall
# total, counting how many of the names seen so far are labeled vs. unlabeled.
total = sum(pair[1] for pair in freq_filt)
target = total * 0.99
sofar = 0
labeled = 0
unlabeled = 0
# Stays None when freq_filt is empty, so the summary expression below cannot
# raise NameError.
target_index = None
# The loop variable is deliberately not named `count`: that would rebind the
# module-level count() function and break re-running the cell that passes it
# to flatMap.
for (i, (name, runs)) in enumerate(freq_filt):
    sofar += runs
    if is_labeled(name):
        labeled += 1
    else:
        unlabeled += 1
    if sofar >= target:
        # Index of the entry at which the 99% threshold is first reached.
        target_index = i
        break
target_index, labeled, unlabeled
# In[13]:
# Fraction of the summed totals in freq_filt that belongs to labeled names.
# Note: `labeled` / `unlabeled` hold summed totals here, not name counts.
labeled = 0
unlabeled = 0
# The loop variable is deliberately not named `count`, so the module-level
# count() function is not rebound.
for (name, runs) in freq_filt:
    if is_labeled(name):
        labeled += runs
    else:
        unlabeled += runs
# Parenthesised so the line is valid both as a Python 2 print statement and
# as a Python 3 print() call; float() avoids integer division on Python 2.
print(labeled / float(labeled + unlabeled))
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment