Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Created May 24, 2016 17:08
Show Gist options
  • Save bsmedberg/6431e257f9ca2648125a83776fcb52a3 to your computer and use it in GitHub Desktop.
Save bsmedberg/6431e257f9ca2648125a83776fcb52a3 to your computer and use it in GitHub Desktop.
Ping latency
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[39]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
from __future__ import division
from montecarlino import grouped_permutation_test
from moztelemetry.spark import get_pings, get_pings_properties
get_ipython().magic(u'pylab inline')
IPython.core.pylabtools.figsize(16, 7)
from operator import add
import datetime
import itertools
import collections
import dateutil
import dateutil.parser
# In[2]:
# Notebook cell: evaluate `sc.defaultParallelism` so the notebook displays it.
# NOTE(review): `sc` is never defined in this file -- presumably the
# SparkContext injected by the notebook environment; confirm.
sc.defaultParallelism
# In[ ]:
def get_property(value, path):
    """Walk nested dicts along *path* and return the value found there.

    Args:
        value: The root object, normally a (nested) dict.
        path: An iterable of keys, applied one after another.

    Returns:
        The object reached after following every key in *path*, or None as
        soon as the current object is not a dict or lacks the next key.
        An empty *path* returns *value* unchanged.
    """
    for key in path:
        # Stop at the first level that cannot be descended into.
        if not isinstance(value, dict) or key not in value:
            return None
        value = value[key]
    return value
# In[4]:
# Filter shared by both ping queries: one app/channel/version and a single
# submission day.
criteria = dict(
    app='Firefox',
    channel='beta',
    version='47.0',
    submission_date="20160522",
)
main_pings = get_pings(sc, doc_type="main", **criteria)
# In[5]:
crash_pings = get_pings(sc, doc_type="crash", **criteria)
# In[18]:
# In[122]:
# Properties extracted from every ping, regardless of document type.
both_properties = ["meta/Timestamp", "creationDate"]

# Main pings additionally need the subsession start date and length.
main_properties = list(both_properties)
main_properties.extend([
    "payload/info/subsessionStartDate",
    "payload/info/subsessionLength",
])

# Crash pings additionally need the crash date and crash time.
crash_properties = list(both_properties)
crash_properties.extend([
    "payload/crashDate",
    "payload/metadata/CrashTime",
])

main_data = get_pings_properties(main_pings, main_properties)
crash_data = get_pings_properties(crash_pings, crash_properties)
# In[123]:
# Mark both extracted datasets as cached; each is used again below (a
# `first()` peek here and a `flatMap` aggregation later).
# NOTE(review): presumably these are Spark RDDs, so `cache()` avoids
# recomputing the extraction for each action -- confirm.
main_data = main_data.cache()
crash_data = crash_data.cache()
# In[124]:
# Fetch one main-ping record and display it as the cell output.
fm = main_data.first()
fm
# In[29]:
# Fetch one crash-ping record and display it as the cell output.
fc = crash_data.first()
fc
# In[70]:
# Counts crash pings whose crash date is missing or cannot be parsed.
errcount = sc.accumulator(0)


def cd_to_timestamp(crash_ping):
    """Yield the latency bucket, in whole days, for one crash ping.

    Latency is the receipt date (2016-05-22, the submission date queried
    above) minus the ping's 'payload/crashDate'.  Negative latencies are
    collapsed into the bucket -1 and latencies above 20 days into 21.

    Pings with a missing or unparseable crash date yield nothing and
    increment ``errcount`` instead.
    """
    raw_date = crash_ping['payload/crashDate']
    if raw_date is None:
        # dateutil does not raise ValueError for None (TypeError or
        # AttributeError depending on its version), so one such ping would
        # otherwise abort the whole job.
        errcount.add(1)
        return
    try:
        cd = dateutil.parser.parse(raw_date).date()
    except (ValueError, OverflowError):
        errcount.add(1)
        return
    # Must stay in sync with criteria['submission_date'].
    rd = datetime.date(2016, 5, 22)
    days = (rd - cd).days
    # Clamp to the range [-1, 21]: -1 means "dated after receipt",
    # 21 means "more than 20 days late".
    yield max(-1, min(days, 21))


crash_ping_latency = crash_data.flatMap(cd_to_timestamp).countByValue()
# Read the accumulator only after the action above has run.
print("errcount %s" % errcount.value)
# In[130]:
# Tabulate the per-latency crash-ping counts, ordered by latency bucket.
crash_df = pd.DataFrame(
    data=crash_ping_latency.items(),
    columns=("Latency", "Count"),
)
crash_df = crash_df.sort_values("Latency")

# Express each bucket as a percentage of all crash pings and plot it.
crash_total = crash_df["Count"].sum()
by_pct = crash_df.assign(pct=crash_df["Count"].div(crash_total / 100))
by_pct.plot.bar(
    x="Latency",
    y="pct",
    title="Latency of activity date (local time) against receipt date (UTC) for crash pings",
)
# In[127]:
# Counts main pings whose subsession start date cannot be parsed.
main_errcount = sc.accumulator(0)


def cd_to_timestamp(main_ping):
    """Yield a (latency_days, usage_hours) pair for one main ping.

    This reuses the name of the crash-ping helper defined in an earlier
    cell and replaces it.

    Latency is the receipt date (2016-05-22, the submission date queried
    above) minus the ping's 'payload/info/subsessionStartDate'.  Negative
    latencies are collapsed into the bucket -1 and latencies above 20 days
    into 21.  Usage hours are 'payload/info/subsessionLength' divided by
    3600 (presumably the length is in seconds -- confirm), capped at 25; a
    missing length counts as 0.

    Pings with no start date or a negative length yield nothing.  Pings
    whose start date cannot be parsed yield nothing and increment
    ``main_errcount``.
    """
    start_date = main_ping['payload/info/subsessionStartDate']
    if start_date is None:
        return
    hours = (main_ping["payload/info/subsessionLength"] or 0) / 3600.0
    if hours < 0:
        return
    hours = min(hours, 25)
    try:
        cd = dateutil.parser.parse(start_date).date()
    except (ValueError, OverflowError):
        # Count into the main-ping accumulator, not the crash-ping
        # `errcount`, so the total printed below is meaningful.
        main_errcount.add(1)
        return
    # Must stay in sync with criteria['submission_date'].
    rd = datetime.date(2016, 5, 22)
    days = (rd - cd).days
    # Clamp to the range [-1, 21]: -1 means "dated after receipt",
    # 21 means "more than 20 days late".
    yield (max(-1, min(days, 21)), hours)


# Sum the usage hours per latency bucket.
main_ping_latency = main_data.flatMap(cd_to_timestamp).foldByKey(0, add).collect()
# Read the accumulator only after the action above has run.
print("errcount %s" % main_errcount.value)
# NOTE: the "Count" column holds summed usage hours, not a ping count; the
# name is kept because the plotting cell below reads main_df["Count"].
main_df = pd.DataFrame(data=main_ping_latency, columns=("Latency", "Count")).sort_values("Latency")
main_df
# In[132]:
# Total usage hours across all latency buckets.  The original read from an
# undefined name `df`; the frame built for main pings is `main_df`.
main_total = main_df['Count'].sum()
# Express each bucket's usage hours as a percentage of the total and plot it.
main_by_pct = main_df.assign(pct=main_df["Count"].div(main_total / 100))
main_by_pct.plot.bar(
    x="Latency",
    y="pct",
    title="Latency of activity date (local time) against receipt date (UTC) for main pings, by usage-hour",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment