Created
May 24, 2016 10:29
-
-
Save georgf/56449b46623ce2493b01a82d0dd08e6e to your computer and use it in GitHub Desktop.
validate created and date header
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date | |
# In[85]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
import datetime as dt | |
from uuid import UUID | |
import re | |
import email.utils as eut | |
import datetime | |
import time | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history | |
get_ipython().magic(u'pylab inline') | |
# The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``"meta/Date"``. | |
# | |
# The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see: | |
# * https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0 | |
# * https://hg.mozilla.org/mozilla-central/rev/40958aebbb80 | |
# In[86]: | |
# Pull Fennec nightly core pings for the submission window in which the new
# fields landed. Both source versions are needed: v5 introduced created/tz,
# v6 is the subsequent version bump.
submission_dates = ("20160510", "20160524")

def _fetch_core_pings(version):
    # One RDD per core-ping source version, same window and full sample.
    return get_pings(sc,
                     app="Fennec",
                     channel="nightly",
                     doc_type="core",
                     source_version=version,
                     submission_date=submission_dates,
                     fraction=1.0)

pings5 = _fetch_core_pings("5")
pings6 = _fetch_core_pings("6")
# In[87]: | |
# Union both versions and narrow each ping to just the fields under test.
properties = ['meta/Date', 'meta/submissionDate', 'clientId', 'created', 'tz']
merged = pings5 + pings6
pings = get_pings_properties(merged, properties)

# Show one sample extract, masking the client id before display.
p = pings.first()
p['clientId'] = '...'
p
# ### Validate the expected ping contents. | |
# Define a helper for the ``Date`` header validation. | |
# In[102]: | |
def valid_date_header(s):
    """Return True if *s* looks like a valid HTTP ``Date`` header value.

    Accepts the standard form, e.g. ``'Sat, 21 May 2016 04:06:41 GMT'``,
    and the Android variant carrying an explicit zero offset, e.g.
    ``'Sat, 21 May 2016 00:03:59 GMT+00:00'``.
    """
    # Raw string avoids the double escaping ('\\+') the original needed, and
    # '{4,4}' is just '{4}'. 'is not None' is the idiomatic None comparison.
    date_pattern = r'^\D+, \d{1,2} \D+ \d{4} \d\d:\d\d:\d\d GMT(\+00:00)?$'
    return re.match(date_pattern, s) is not None
# Sanity-check the validator against a live ping plus both known-good formats.
all([
    valid_date_header(pings.first()['meta/Date']),
    valid_date_header('Sat, 21 May 2016 00:03:59 GMT+00:00'),
    valid_date_header('Sat, 21 May 2016 04:06:41 GMT'),
])
# In[103]: | |
def ping_check(ping):
    """Validate one core-ping extract; return '' if OK, else an error label.

    Checks field presence, field types, the ``created`` date format, the
    ``Date`` header format, and that ``tz`` is a plausible minute offset.
    """
    props = {
        'meta/Date': [unicode],
        'meta/submissionDate': [unicode],
        'clientId': [unicode],
        'created': [unicode],
        'tz': [int, long]
    }
    for k, types in props.iteritems():
        if k not in ping:
            # BUG FIX: original referenced an undefined name 'prop' here,
            # which raised NameError for any ping with a missing field.
            return 'missing field: ' + k
        if type(ping[k]) not in types:
            return 'wrong type for ' + k
    if not re.match(r'^\d\d\d\d-\d\d-\d\d$', ping['created']):
        return 'invalid created date'
    # BUG FIX: original validated the global sample ping 'p' instead of the
    # ping under test, so this check ran against the same value every time.
    if not valid_date_header(ping['meta/Date']):
        return 'invalid date header'
    tz = ping['tz']
    # Offsets beyond +/- 24 hours (in minutes) cannot be real timezones.
    if tz < -24*60 or tz > 24*60:
        return 'invalid timezone value'
    return ''
# In[104]: | |
# Tag every ping with its validation outcome; '' means all checks passed.
results = pings.map(lambda ping: (ping_check(ping), ping))
results.countByKey()
# So all the expected fields are present and in a valid format. | |
# Let's show an example for each group.
# In[107]: | |
# Pull one representative ping per validation outcome, masking client ids.
def _first_example(kv):
    outcome, group = kv
    return (outcome, list(group)[0])

ps = results.groupByKey().map(_first_example).collect()
for _, example in ps:
    example['clientId'] = '...'
ps
# ### Cross-check 'created' and Date header | |
# Now let's do some sanity checking on both the ``created`` field as well as the ``Date`` header.
# As both can be affected by clock skew, a reasonable assumption seems to be that ``created`` and ``Date`` should usually be close to each other. | |
# Some discrepancies are expected due to delayed uploads, from jumping clocks, etc. | |
# In[108]: | |
def delta_days(p):
    """Whole days between a ping's Date header and its 'created' day
    (header minus created)."""
    created = datetime.datetime.strptime(p['created'], '%Y-%m-%d')
    # Parse the RFC-style header, then round-trip it through the local epoch
    # exactly as the original analysis did.
    header_ts = time.mktime(eut.parsedate(p['meta/Date']))
    header = datetime.datetime.fromtimestamp(header_ts)
    return (header - created).days
# Compute the per-ping day delta and summarize its distribution.
deltas = pings.map(delta_days)
delta_series = pd.Series(deltas.collect())
delta_series.describe(percentiles=[.25, .5, .75, .90, .95, .99])
delta_series.plot(kind='hist', bins=20, logy=True, title='Histogram of date deltas in days')
# How many clients actually have a date delta >1 day?
# First lets get the overall client count. | |
# In[111]: | |
# Distinct-client denominator used by the breakdowns below.
overall = get_one_ping_per_client(pings).count()
overall
# Now let's break the deltas between "created" and the Date header down into a "no. of clients off by N days" distribution. To keep this bounded we count all deltas >10 days as 11.
# In[112]: | |
def bucket_day(p):
    """Absolute day delta for a ping, capped at 11 (the '>10 days' bucket)."""
    distance = abs(delta_days(p))
    return distance if distance < 11 else 11
# Per client, take the worst (largest) bucketed delta observed, then count
# how many clients fall into each bucket.
per_client = pings.map(lambda p: (p['clientId'], bucket_day(p))).groupByKey()
worst_per_client = per_client.mapValues(lambda ds: max(list(ds)))
max_deltas = worst_per_client.values().countByValue()
max_deltas
off_series = pd.Series(max_deltas)
off_series.plot(kind='bar', logy=True, title='Chart of no. of clients off by N days')
# This looks like rather normal falloff. | |
# ### tz distribution | |
# Now lets look at how the values for the timezone offset field, ``tz``, are distributed. | |
# | |
# The values of the field are in minutes, we convert them to hours for easier viewing. | |
# In[114]: | |
# tz offsets arrive in minutes; present them in hours.
# NOTE: this notebook runs Python 2, so '/ 60' is integer (floor) division.
tzs = pings.map(lambda p: p['tz'] / 60)
tz_series = pd.Series(tzs.collect())
tz_series.describe(percentiles=[.25, .5, .75, .90, .95, .99])
tz_series.plot(kind='hist', bins=50, logy=True, title='Histogram of tz values in hours')
# This looks pretty sane. | |
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment