Skip to content

Instantly share code, notes, and snippets.

@jtg567
Last active August 24, 2017 20:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jtg567/9c048993a2710b5e6df73b2244de066e to your computer and use it in GitHub Desktop.
Save jtg567/9c048993a2710b5e6df73b2244de066e to your computer and use it in GitHub Desktop.
Race Cache 2 Ping Pull
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as DT
from plotly.graph_objs import *
from moztelemetry import Dataset, get_pings_properties
from operator import itemgetter
from pprint import pprint as pp
# Notebook bootstrap: load the pylab namespace, then move matplotlib onto
# the 'agg' backend.
get_ipython().magic(u'pylab inline')
pyplot.switch_backend('agg')
# Report the Spark context's default parallelism.
print("".join(["\n", str(sc.defaultParallelism), "\n"]))
# Inclusive bounds for the submissionDate filter, as YYYYMMDD strings.
date_range = ('20170726', '20170802')
print("".join([
    "date_range start: ", date_range[0],
    ", date_range end: ", date_range[1],
    "\n",
]))
# Sample rate handed to the dataset reader.
sr = 1
# In[2]:
# Dataset that the experiment records are read from.
cohorts = Dataset.from_source("telemetry-cohorts")
cohorts.schema  # bare expression kept from the notebook; its value is not used
experiment = "pref-flip-rcwn2-1381816"
# Select this experiment's records whose submissionDate lies inside
# date_range (both ends inclusive), then read them at sample rate `sr`.
RDD = (
    cohorts
    .where(
        submissionDate=lambda sD: date_range[0] <= sD <= date_range[1],
        experimentId=experiment,
    )
    .records(sc, sample=sr)
)
# In[3]:
# Keep only 'main' pings; a ping whose meta has no docType is dropped too.
filtered = RDD.filter(
    lambda ping: 'main' == ping['meta'].get('docType', "None")
)
# In[4]:
# Column name -> slash-separated path of the value inside a ping, in the
# form passed to get_pings_properties below.
props = {
    "clientId": "clientId",
    "channel": "application/channel",
    "FxVersion": "application/version",
    "osName": "environment/system/os/name",
    "osVersion": "environment/system/os/version",
    # Built from `experiment` so the branch lookup cannot drift away from
    # the experimentId used to select the records.
    "branch": "environment/experiments/" + experiment + "/branch",
    # Race-cache-with-network histograms.
    "nrc_wn_oosd": "payload/histograms/NETWORK_RACE_CACHE_WITH_NETWORK_OCEC_ON_START_DIFF",
    "nrc_wn_st": "payload/histograms/NETWORK_RACE_CACHE_WITH_NETWORK_SAVED_TIME",
    "nrc_wn_u2": "payload/histograms/NETWORK_RACE_CACHE_WITH_NETWORK_USAGE_2",
    "nrc_bw_rnw": "payload/histograms/NETWORK_RACE_CACHE_BANDWIDTH_RACE_NETWORK_WIN",
    "nrc_bw_rcw": "payload/histograms/NETWORK_RACE_CACHE_BANDWIDTH_RACE_CACHE_WIN",
    "nrc_bw_nr": "payload/histograms/NETWORK_RACE_CACHE_BANDWIDTH_NOT_RACE",
    "nrc_val": "payload/histograms/NETWORK_RACE_CACHE_VALIDATION",
    # HTTP page / sub-resource load histograms.
    "hpc_l1": "payload/histograms/HTTP_PAGE_COMPLETE_LOAD",
    "hpc_l2": "payload/histograms/HTTP_PAGE_COMPLETE_LOAD_V2",
    "hpc_ln": "payload/histograms/HTTP_PAGE_COMPLETE_LOAD_NET_V2",
    "hpc_lc": "payload/histograms/HTTP_PAGE_COMPLETE_LOAD_CACHED_V2",
    "hsc_l1": "payload/histograms/HTTP_SUB_COMPLETE_LOAD",
    "hsc_l2": "payload/histograms/HTTP_SUB_COMPLETE_LOAD_V2",
    "hsc_lc": "payload/histograms/HTTP_SUB_COMPLETE_LOAD_CACHED_V2",
    "hsc_ln": "payload/histograms/HTTP_SUB_COMPLETE_LOAD_NET_V2",
    "tcplt": "payload/histograms/TOTAL_CONTENT_PAGE_LOAD_TIME",
}
# Reduce every filtered ping to just the properties listed above.
RDD_prop = get_pings_properties(filtered, props)
# In[5]:
RDD_prop.take(1)
# In[ ]:
# Collect each client's pings into one (clientId, [ping, ...]) pair.
# groupByKey gathers the values directly; building the lists through
# reduceByKey with `x + y` re-copies the growing list on every merge.
byClient = (
    RDD_prop
    .keyBy(lambda p: p['clientId'])
    .groupByKey()
    .mapValues(list)  # materialise as a list: doIT calls len() and iterates it repeatedly
)
# In[ ]:
def gather(pinglist, prop):
    """Return the value stored under ``prop`` in every ping of ``pinglist``.

    Parameters:
        pinglist: iterable of records that can be indexed by ``prop``.
        prop: key to look up in each record.

    Returns:
        A list with one entry per record, in iteration order.

    Raises:
        KeyError: if a dict record has no ``prop`` entry.
    """
    return [ping[prop] for ping in pinglist]
#def hist
def doIT(p):
    """Summarise all pings of one client as a single flat record.

    Parameters:
        p: a ``(clientId, pinglist)`` pair; every ping in ``pinglist`` is
            indexed by 'branch', 'channel', 'osName' and 'osVersion'.

    Returns:
        dict with keys ``clientId``, ``branch``, ``pings``, ``channel`` and
        ``os``.  ``pings`` is the number of pings; ``branch``, ``channel``
        and ``os`` are ``np.unique`` arrays of the distinct values seen.
        Each ``os`` entry is "<osName> <osVersion>" for one ping.
    """
    clientId, pinglist = p
    # Pair name and version per ping BEFORE de-duplicating.  Adding two
    # independently de-duplicated arrays either raises TypeError (string
    # dtypes without an `add` loop) or glues together unrelated entries and
    # breaks when the arrays differ in length.
    os_labels = [
        "%s %s" % (name, version)
        for name, version in zip(gather(pinglist, 'osName'),
                                 gather(pinglist, 'osVersion'))
    ]
    return dict(
        clientId=clientId,
        branch=np.unique(gather(pinglist, 'branch')),
        pings=len(pinglist),
        channel=np.unique(gather(pinglist, 'channel')),
        os=np.unique(os_labels),
    )
# One summary record per client, produced by doIT.
fin = byClient.map(doIT)
# In[ ]:
# Bring the summaries to the driver and write a local CSV named after the
# experiment.
pd.DataFrame(fin.collect()).to_csv(experiment + ".csv", index=False)
# In[ ]:
s3_path = "s3://telemetry-private-analysis-2/jgaunt/" + experiment + ".csv"
# Records in RDD_prop are indexed by property name, so ','.join(record)
# would emit the names on every line.  Write the values instead, in one
# fixed column order (sorted property names); None becomes an empty field.
# NOTE(review): values are rendered with str(); confirm that the histogram
# values serialise in the form downstream consumers expect.
columns = sorted(props)
RDD_prop.map(
    lambda x: ','.join('' if x[c] is None else str(x[c]) for c in columns)
).repartition(sc.defaultParallelism).saveAsTextFile(s3_path)
print("file saved at: " + s3_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment