# coding: utf-8
# ### [Bug 1271391](https://bugzilla.mozilla.org/show_bug.cgi?id=1271391) - Validate Fennec Date header & creation date
# In[85]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import time
import re
import email.utils as eut
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
get_ipython().magic(u'pylab inline')
# The Date header landed [2016-05-10](https://hg.mozilla.org/mozilla-central/rev/fdece96f5cf5). This is available in ``"meta/Date"``.
#
# The ping creation date field and timezone (``created`` and ``tz``) also landed 2016-05-10, bumping the core ping version to 5 - see:
# * https://hg.mozilla.org/mozilla-central/rev/cd0c3acb37e0
# * https://hg.mozilla.org/mozilla-central/rev/40958aebbb80
# In[86]:
submission_dates = ("20160510", "20160524")

# Fetch both core ping versions present in this window (v5 and v6).
pings5 = get_pings(sc,
                   app="Fennec",
                   channel="nightly",
                   doc_type="core",
                   source_version="5",
                   submission_date=submission_dates,
                   fraction=1.0)

pings6 = get_pings(sc,
                   app="Fennec",
                   channel="nightly",
                   doc_type="core",
                   source_version="6",
                   submission_date=submission_dates,
                   fraction=1.0)
# In[87]:
merged = pings5 + pings6
properties = ['meta/Date', 'meta/submissionDate', 'clientId', 'created', 'tz']
pings = get_pings_properties(merged, properties)
# The data extract we work with looks like this:
# In[88]:
p = pings.first()
p['clientId'] = '...'
p
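# The notebook output is not preserved in this export; for reference, the
# extract is a flat dict shaped roughly like this (illustrative values only,
# clientId redacted):
#
#     {'clientId': '...',
#      'created': u'2016-05-20',
#      'meta/Date': u'Fri, 20 May 2016 10:29:33 GMT',
#      'meta/submissionDate': u'20160520',
#      'tz': 120}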
# ### Validate the expected ping contents.
# Define a helper for the ``Date`` header validation.
# In[102]:
def valid_date_header(s):
    # Accept e.g. "Sat, 21 May 2016 04:06:41 GMT"; some clients append
    # an explicit "+00:00" after "GMT", so allow that too.
    date_pattern = r'^\D+, \d{1,2} \D+ \d{4} \d\d:\d\d:\d\d GMT(\+00:00)?$'
    return re.match(date_pattern, s) is not None

(valid_date_header(pings.first()['meta/Date']) and
 valid_date_header('Sat, 21 May 2016 00:03:59 GMT+00:00') and
 valid_date_header('Sat, 21 May 2016 04:06:41 GMT'))
# In[103]:
def ping_check(ping):
    """Return an error description, or '' if the ping looks valid."""
    props = {
        'meta/Date': [unicode],
        'meta/submissionDate': [unicode],
        'clientId': [unicode],
        'created': [unicode],
        'tz': [int, long],
    }
    for k, types in props.iteritems():
        if k not in ping:
            return 'missing field: ' + k
        if type(ping[k]) not in types:
            return 'wrong type for ' + k
    if not re.match(r'^\d\d\d\d-\d\d-\d\d$', ping['created']):
        return 'invalid created date'
    if not valid_date_header(ping['meta/Date']):
        return 'invalid date header'
    # tz is the timezone offset in minutes; anything beyond +/- 24 hours is bogus.
    tz = ping['tz']
    if tz < -24*60 or tz > 24*60:
        return 'invalid timezone value'
    return ''
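# As a quick local check, a hypothetical well-formed ping (all values made
# up) should come back with the empty string:
# In[ ]:
ping_check({'meta/Date': u'Sat, 21 May 2016 04:06:41 GMT',
            'meta/submissionDate': u'20160521',
            'clientId': u'...',
            'created': u'2016-05-21',
            'tz': 120})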
# In[104]:
results = pings.map(lambda p: (ping_check(p), p))
# In[105]:
results.countByKey()
# So all the expected fields are present and in a valid format.
# Let's show an example for each group.
# In[107]:
ps = results.groupByKey().map(lambda t: (t[0], list(t[1])[0])).collect()
for k, p in ps:
    p['clientId'] = '...'
ps
# ### Cross-check 'created' and Date header
# Now let's do some sanity checking on both the ``created`` field and the ``Date`` header.
# As both can be affected by clock skew, a reasonable assumption is that ``created`` and ``Date`` should usually be close to each other.
# Some discrepancies are expected due to delayed uploads, jumping clocks, etc.
# In[108]:
def delta_days(p):
    # Days between the ping's local creation date and the wall-clock
    # date from its HTTP Date header.
    createdDate = datetime.datetime.strptime(p['created'], '%Y-%m-%d')
    headerDate = datetime.datetime.fromtimestamp(time.mktime(eut.parsedate(p['meta/Date'])))
    d = headerDate - createdDate
    return d.days

deltas = pings.map(lambda p: delta_days(p))
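# To illustrate on a hypothetical ping (dates made up): a Date header one
# day after the creation date yields a delta of 1.
# In[ ]:
delta_days({'created': u'2016-05-20',
            'meta/Date': u'Sat, 21 May 2016 04:06:41 GMT'})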
# In[109]:
delta_series = pd.Series(deltas.collect())
delta_series.describe(percentiles=[.25, .5, .75, .90, .95, .99])
# In[110]:
delta_series.plot(kind='hist', bins=20, logy=True, title='Histogram of date deltas in days')
# How many clients actually have date deltas > 1 day?
# First let's get the overall client count.
# In[111]:
overall = get_one_ping_per_client(pings).count()
overall
# Now let's break the deltas between ``created`` and the ``Date`` header down into a "no. of clients off by N days" distribution. To keep this bounded we count all deltas > 10 days as 11.
# In[112]:
def bucket_day(p):
    # Cap all deltas > 10 days at 11 to keep the distribution bounded.
    return min(11, abs(delta_days(p)))

max_deltas = pings.map(lambda p: (p['clientId'], bucket_day(p))) \
                  .groupByKey() \
                  .mapValues(lambda deltas: max(list(deltas))) \
                  .values() \
                  .countByValue()
max_deltas
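# The capping in action on a hypothetical ping (dates made up) whose header
# date is 20 days after its creation date:
# In[ ]:
bucket_day({'created': u'2016-05-01',
            'meta/Date': u'Sat, 21 May 2016 04:06:41 GMT'})  # -> capped to 11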
# In[113]:
off_series = pd.Series(max_deltas)
off_series.plot(kind='bar', logy=True, title='Chart of no. of clients off by N days')
# This looks like rather normal falloff.
# ### tz distribution
# Now let's look at how the values of the timezone offset field, ``tz``, are distributed.
#
# The values are in minutes; we convert them to hours for easier viewing.
# In[114]:
# Divide by 60.0 (float) so half-hour offsets like +5:30 aren't truncated.
tzs = pings.map(lambda p: p['tz'] / 60.0)
tz_series = pd.Series(tzs.collect())
tz_series.describe(percentiles=[.25, .5, .75, .90, .95, .99])
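# A quick sanity check of the conversion on a few representative offsets
# (illustrative values; +330 minutes corresponds to UTC+5:30):
# In[ ]:
[m / 60.0 for m in (-480, 0, 330, 720)]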
# In[115]:
tz_series.plot(kind='hist', bins=50, logy=True, title='Histogram of tz values in hours')
# This looks pretty sane.
# In[ ]: