Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Created May 24, 2016 17:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bsmedberg/0f84d540eca797bfe3edde432006317c to your computer and use it in GitHub Desktop.
Save bsmedberg/0f84d540eca797bfe3edde432006317c to your computer and use it in GitHub Desktop.
ping-latency
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[39]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
from __future__ import division
from montecarlino import grouped_permutation_test
from moztelemetry.spark import get_pings, get_pings_properties
get_ipython().magic(u'pylab inline')
IPython.core.pylabtools.figsize(16, 7)
from operator import add
import datetime
import itertools
import collections
import dateutil
import dateutil.parser
# In[2]:
# Notebook probe: evaluate the `defaultParallelism` attribute of `sc` so the
# cell displays it. `sc` is not defined in this file -- presumably the
# SparkContext injected by the notebook environment (TODO confirm).
sc.defaultParallelism
# In[ ]:
def get_property(value, path):
    """Follow the keys in *path* down through nested dicts starting at *value*.

    Returns whatever is reached after the last key. Returns None as soon as
    the current object is not a dict or does not contain the next key. An
    empty *path* returns *value* unchanged.
    """
    node = value
    for step in path:
        if isinstance(node, dict) and step in node:
            node = node[step]
        else:
            return None
    return node
# In[4]:
# Selection criteria shared by both ping queries below. The submission_date
# here (2016-05-22) is the same day the latency helpers later in the file
# hard-code as the receipt date.
criteria = dict(
    app='Firefox',
    channel='beta',
    version='47.0',
    submission_date="20160522",
)
# Fetch "main" pings matching the criteria via the Spark context `sc`.
main_pings = get_pings(sc, doc_type="main", **criteria)
# In[5]:
# Same selection, but for "crash" pings.
crash_pings = get_pings(sc, doc_type="crash", **criteria)
# In[18]:
# In[122]:
# Fields requested for every ping, regardless of document type.
both_properties = ["meta/Timestamp", "creationDate"]

# Main pings: the common fields plus the subsession start date and length
# read by the main-ping latency helpers.
main_properties = list(both_properties)
main_properties.extend([
    "payload/info/subsessionStartDate",
    "payload/info/subsessionLength",
])

# Crash pings: the common fields plus the crash date fields. Only
# "payload/crashDate" is read by the visible code in this file.
crash_properties = list(both_properties)
crash_properties.extend([
    "payload/crashDate",
    "payload/metadata/CrashTime",
])

# Project each ping collection down to just the requested fields.
main_data = get_pings_properties(main_pings, main_properties)
crash_data = get_pings_properties(crash_pings, crash_properties)
# In[123]:
# Cache both projected datasets; main_data in particular is traversed by two
# separate jobs later in this file (flatMap over cd_to_timestamp and ssd_to_cd).
main_data = main_data.cache()
crash_data = crash_data.cache()
# In[124]:
# Peek at one main-ping record; the bare `fm` is there so the notebook cell
# displays it.
fm = main_data.first()
fm
# In[29]:
# Peek at one crash-ping record, displayed the same way.
fc = crash_data.first()
fc
# In[134]:
# NOTE(review): `df` is not referenced anywhere else in the visible code.
df = None
# In[70]:
# Accumulator bumped by the crash-ping cd_to_timestamp helper for each ping
# it cannot process; its value is printed after the crash latency job runs.
errcount = sc.accumulator(0)
def cd_to_timestamp(crash_ping):
    """Yield the latency bucket (in days) for a single crash ping.

    Latency is the number of days between the date part of the ping's
    ``payload/crashDate`` and the fixed receipt date 2016-05-22 (the same
    day as the ``submission_date`` used to select the pings). Buckets:

    * ``-1``  -- crash date is after the receipt date
    * ``0``..``20`` -- that many days of latency
    * ``21`` -- more than 20 days of latency

    This is a generator meant for ``flatMap``: it yields one value for a
    usable ping and nothing otherwise. Pings with no crash date are skipped
    silently (matching how the main-ping helpers treat a missing start
    date); pings whose crash date cannot be parsed bump ``errcount``.
    """
    crash_date = crash_ping['payload/crashDate']
    if crash_date is None:
        # The parser cannot handle None and would raise something other than
        # ValueError, aborting the whole Spark job for one incomplete ping.
        return
    try:
        cd = dateutil.parser.parse(crash_date).date()
    except (ValueError, TypeError, OverflowError):
        # ValueError: unrecognised date string; TypeError: field is not a
        # string; OverflowError: parsed date exceeds platform integer limits.
        errcount.add(1)
        return
    rd = datetime.date(2016, 5, 22)
    days = (rd - cd).days
    if days < 0:
        days = -1
    elif days > 20:
        days = 21
    yield days
crash_ping_latency = crash_data.flatMap(cd_to_timestamp).countByValue()
print "errcount", errcount.value
# In[130]:
crash_df = pd.DataFrame(data=crash_ping_latency.items(), columns=("Latency", "Count")).sort_values(("Latency"))
crash_total = crash_df['Count'].sum()
by_pct = crash_df.assign(pct=crash_df["Count"].div(crash_total / 100))
by_pct.plot.bar(x="Latency", y="pct",
title="Latency of activity date (local time) against receipt date (UTC) for crash pings")
# In[135]:
main_errcount = sc.accumulator(0)
def cd_to_timestamp(main_ping):
    """Yield ``(latency_days, usage_hours)`` for a single main ping.

    ``usage_hours`` is ``payload/info/subsessionLength`` divided by 3600.0
    (a missing length counts as 0), capped at 25. ``latency_days`` is the
    number of days between the date part of
    ``payload/info/subsessionStartDate`` and the fixed receipt date
    2016-05-22 (the same day as the ``submission_date`` used to select the
    pings), bucketed as:

    * ``-1``  -- start date is after the receipt date
    * ``0``..``20`` -- that many days of latency
    * ``21`` -- more than 20 days of latency

    This is a generator meant for ``flatMap``: it yields one pair for a
    usable ping and nothing otherwise. Pings with no start date or with a
    negative length are skipped silently; pings whose fields cannot be
    processed bump ``main_errcount``.

    NOTE: this name is also used by the crash-ping helper defined earlier in
    the file; whichever definition ran last is the one in effect.
    """
    start_date = main_ping['payload/info/subsessionStartDate']
    if start_date is None:
        return
    try:
        hours = (main_ping["payload/info/subsessionLength"] or 0) / 3600.0
        if hours > 25:
            hours = 25
        elif hours < 0:
            return
        cd = dateutil.parser.parse(start_date).date()
    except (ValueError, TypeError, OverflowError):
        # ValueError: unrecognised date string; TypeError: a field has the
        # wrong type (non-numeric length, non-string date); OverflowError:
        # parsed date exceeds platform integer limits. Count the ping as bad
        # instead of letting one record abort the whole Spark job.
        main_errcount.add(1)
        return
    rd = datetime.date(2016, 5, 22)
    days = (rd - cd).days
    if days < 0:
        days = -1
    elif days > 20:
        days = 21
    yield (days, hours)
main_ping_latency = main_data.flatMap(cd_to_timestamp).foldByKey(0, add).collect()
print "errcount", main_errcount.value
main_df = pd.DataFrame(data=main_ping_latency, columns=("Latency", "Hours")).sort_values(("Latency"))
main_df
# In[133]:
main_total = main_df['Hours'].sum()
main_by_pct = main_df.assign(pct=main_df["Hours"].div(main_total / 100))
main_by_pct.plot.bar(x="Latency", y="pct",
title="Latency of activity date (local time) against receipt date (UTC) for main pings, by usage-hour")
# In[ ]:
cd_errcount = sc.accumulator(0)
def ssd_to_cd(main_ping):
    """Yield ``(latency_days, usage_hours)`` measured against ping creation.

    ``usage_hours`` is ``payload/info/subsessionLength`` divided by 3600.0
    (a missing length counts as 0), capped at 25. ``latency_days`` is the
    number of days between the date part of
    ``payload/info/subsessionStartDate`` and the date part of the ping's
    own ``creationDate``, bucketed as:

    * ``-1``  -- start date is after the creation date
    * ``0``..``20`` -- that many days of latency
    * ``21`` -- more than 20 days of latency

    This is a generator meant for ``flatMap``: it yields one pair for a
    usable ping and nothing otherwise. Pings missing either date, or with a
    negative length, are skipped silently; pings whose fields cannot be
    processed bump ``cd_errcount``.
    """
    start_date = main_ping['payload/info/subsessionStartDate']
    creation_date = main_ping['creationDate']
    if start_date is None or creation_date is None:
        # The parser cannot handle None and would raise something other than
        # ValueError, aborting the whole Spark job for one incomplete ping.
        return
    try:
        hours = (main_ping["payload/info/subsessionLength"] or 0) / 3600.0
        if hours > 25:
            hours = 25
        elif hours < 0:
            return
        ssd = dateutil.parser.parse(start_date).date()
        cd = dateutil.parser.parse(creation_date).date()
    except (ValueError, TypeError, OverflowError):
        # ValueError: unrecognised date string; TypeError: a field has the
        # wrong type; OverflowError: parsed date exceeds platform integer
        # limits. Count the ping as bad rather than failing the job.
        cd_errcount.add(1)
        return
    days = (cd - ssd).days
    if days < 0:
        days = -1
    elif days > 20:
        days = 21
    yield (days, hours)
cd_latency = main_data.flatMap(ssd_to_cd).foldByKey(0, add).collect()
print "errcount", cd_errcount.value
# In[138]:
cd_df = pd.DataFrame(data=main_ping_latency, columns=("Latency", "Hours")).sort_values(("Latency"))
cd_total = cd_df['Hours'].sum()
cd_by_pct = cd_df.assign(pct=cd_df["Hours"].div(cd_total / 100))
cd_by_pct.plot.bar(x="Latency", y="pct",
title="Latency of activity date (local time) against ping creation date (UTC) for main pings, by usage-hour")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment