Last active
May 26, 2016 12:06
-
-
Save georgf/2d8cfccefbec1b89b54dd03f44d3ad2a to your computer and use it in GitHub Desktop.
fennec-uploader-validation-49
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ## [Bug 1268513](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513) - Validate "core" ping re-uploader in Fennec 49 Nightly | |
# Validate "core" pings sent by Firefox for Android 49 Nightly to make sure the data and volumes look sane.
# In[6]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
import datetime as dt | |
from uuid import UUID | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history | |
get_ipython().magic(u'pylab inline') | |
# We get the pings separately from 3 different build ranges here:
# * ``old``- before [bug 1268513](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513) | |
# * ``suspect`` - after bug 1268513 but before all the [bug fixes mentioned here](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513#c12) | |
# * ``new`` - after all the bug fixes | |
# In[7]: | |
def filtered(pings):
    """Project pings down to the analyzed fields and keep only Android ones."""
    fields = ["os", "clientId", "seq", "meta/documentId", "meta/submissionDate"]
    projected = get_pings_properties(pings, fields)
    return projected.filter(lambda p: p.get("os", "") == "Android")
# The three build ranges share every query parameter except the date
# windows, so fetch them through one helper to keep the queries consistent
# and make the windows easy to compare at a glance.
def _fetch_core_pings(submission_date, build_id):
    """Fetch filtered Android "core" pings for the given windows.

    submission_date, build_id -- (start, end) string tuples, as accepted
    by moztelemetry's get_pings.
    """
    return filtered(get_pings(sc,
                              app="Fennec",
                              channel="nightly",
                              doc_type="core",
                              source_version="*",
                              submission_date=submission_date,
                              build_id=build_id,
                              fraction=1.0))

# Builds from before bug 1268513 landed.
old_pings = _fetch_core_pings(("20160418", "20160527"),
                              ("20160418000000", "20160503000000"))
# Builds after bug 1268513 but before the follow-up fixes (comment 12).
suspect_pings = _fetch_core_pings(("20160504", "20160527"),
                                  ("20160504000000", "20160515000000"))
# Builds with all the follow-up fixes applied.
new_pings = _fetch_core_pings(("20160516", "20160527"),
                              ("20160516000000", "20160524000000"))
# How many pings are we looking at? | |
# In[8]: | |
# Total (pre-dedupe) ping counts per build range; these feed the
# duplicate-rate calculation further down.
old_ping_counts = old_pings.count()
suspect_ping_counts = suspect_pings.count()
new_ping_counts = new_pings.count()
# Bare tuple: displayed as the notebook cell's output.
(old_ping_counts, suspect_ping_counts, new_ping_counts)
# How many different clients are we seeing? | |
# In[9]: | |
# Distinct client counts per build range (one ping per client, then count).
old_client_count = get_one_ping_per_client(old_pings).count()
suspect_client_count = get_one_ping_per_client(suspect_pings).count()
new_client_count = get_one_ping_per_client(new_pings).count()
# Fix: the original output tuple omitted suspect_client_count even though
# it is computed here (and used later for the gap percentages).
(old_client_count, suspect_client_count, new_client_count)
# Dedupe pings. | |
# In[10]: | |
def dedupe(pings):
    """Drop duplicate pings, keeping one arbitrary ping per documentId."""
    keyed = pings.map(lambda p: (p["meta/documentId"], p))
    unique = keyed.reduceByKey(lambda first, _second: first)
    return unique.values()

old_pings = dedupe(old_pings)
suspect_pings = dedupe(suspect_pings)
new_pings = dedupe(new_pings)
# How many duplicates did we discard? | |
# In[11]: | |
# Post-dedupe ping counts per build range.
old_deduped_count = old_pings.count()
suspect_deduped_count = suspect_pings.count()
new_deduped_count = new_pings.count()
# Number of duplicate documents discarded by dedupe().
old_discarded = old_ping_counts - old_deduped_count
suspect_discarded = suspect_ping_counts - suspect_deduped_count
new_discarded = new_ping_counts - new_deduped_count
# In[12]:
# Absolute discard count plus the duplicate fraction (3 decimal places).
print "old discarded: ", (old_discarded, round(float(old_discarded) / old_ping_counts, 3))
print "suspect discarded: ", (suspect_discarded, round(float(suspect_discarded) / suspect_ping_counts, 3))
print "new discarded: ", (new_discarded, round(float(new_discarded) / new_ping_counts, 3))
# So we see quite some more duplicate pings now. | |
# ### How is the daily submission count progressing? | |
# In[13]: | |
# Union of all three build ranges, for the volume/ADI views below.
merged = old_pings + suspect_pings + new_pings
# In[14]:
# Ping volume per submission date (dict: "YYYYMMDD" string -> count).
counts_by_day = merged.map(lambda p: p["meta/submissionDate"]) .countByValue()
# In[15]:
daily_counts = pd.Series(counts_by_day)
daily_counts.describe()
# In[16]:
daily_counts.plot(title="Daily ping counts")
# ### How many clients do we see per day? | |
# In[17]: | |
# Distinct clients per submission date (ADI-style count), as an RDD of
# (date, count) pairs.
# Reworked from the original aggregateByKey(set(), ...) form: that built a
# full per-day set of clientIds (unbounded memory per key) and shadowed the
# builtin `id` in its lambdas. distinct() + a counting reduce yields the
# same pairs with constant state per key.
adi = merged.map(lambda p: (p["meta/submissionDate"], p["clientId"])) \
            .distinct() \
            .map(lambda t: (t[0], 1)) \
            .reduceByKey(lambda a, b: a + b)
# In[18]:
# Sort by date so the series index is chronological (dates sort lexically).
adi_series = pd.Series(dict(sorted(adi.collect())))
adi_series.describe()
# In[19]:
adi_series.plot(title="ADI")
# ### Extract the ping sequence gaps per client | |
# In[21]: | |
def get_seq_gaps(pings):
    """Return the list of sequence-number gaps for one client's pings.

    A gap of g means g pings are missing between two consecutive observed
    "seq" values. Duplicate seq values yield a diff of -1 and are not
    reported as gaps.
    """
    seqs = sorted([p["seq"] for p in pings])
    # List comprehension instead of filter(): filter() only returns a list
    # on Python 2, and find_clients_with_gaps calls len() on the result.
    return [gap for gap in np.diff(seqs) - 1 if gap > 0]

def grouped_gaps(pings):
    """Map each clientId to the list of seq gaps across its pings."""
    return pings.groupBy(lambda p: p["clientId"]) .map(lambda t: (t[0], get_seq_gaps(t[1])))

def find_clients_with_gaps(pings):
    """Keep only (clientId, gaps) pairs where at least one gap was seen."""
    return grouped_gaps(pings).filter(lambda t: len(t[1]) > 0)
# In[25]: | |
# Cache the per-client gap RDDs: each is hit by multiple actions below
# (count(), aggregate(), map().collect()).
gaps_new = find_clients_with_gaps(new_pings).cache()
gaps_sus = find_clients_with_gaps(suspect_pings).cache()
gaps_old = find_clients_with_gaps(old_pings).cache()
# How many clients are affected? | |
# In[29]: | |
def pct(a, b):
    """Return a / b rounded to 3 decimal places, as a fraction (not *100).

    Returns 0.0 when b is 0 so an empty build range cannot abort the
    report with a ZeroDivisionError.
    """
    if not b:
        return 0.0
    return round(float(a) / b, 3)
# In[31]: | |
print "old: ", gaps_old.count(), pct(gaps_old.count(), old_client_count) | |
print "suspect: ", gaps_sus.count(), pct(gaps_sus.count(), suspect_client_count) | |
print "new: ", gaps_new.count(), pct(gaps_new.count(), new_client_count) | |
# Good progress, much fewer clients are affected by missing pings - 3.1% vs. previously 20.4%. | |
# | |
# I'm surprised about the high number of affected clients before the change. I think we haven't explicitly looked at that number before? | |
# From the progression it seems that the [bugs mentioned here](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513#c12) had a positive impact. | |
# ### Look at overall gap counts | |
# In[32]: | |
# Flatten the per-client gap lists into one flat list per build range.
# flatMap + collect replaces aggregate([], a+b, a+b): the repeated list
# concatenation in aggregate is quadratic in the number of clients, while
# flatMap streams the elements out linearly.
all_gaps_old = gaps_old.values().flatMap(lambda gs: gs).collect()
all_gaps_sus = gaps_sus.values().flatMap(lambda gs: gs).collect()
all_gaps_new = gaps_new.values().flatMap(lambda gs: gs).collect()
# In[33]: | |
# Missing pings as a fraction of all received (deduped) pings, per range.
(pct(sum(all_gaps_old), old_deduped_count),
 pct(sum(all_gaps_sus), suspect_deduped_count),
 pct(sum(all_gaps_new), new_deduped_count))
# Hm, that seems pretty high? But the missing ping count just dropped by a third with the new uploader, so that's progress.
# | |
# We need to monitor this closely on Beta and figure out whether this is strange Nightly behavior or a general problem. | |
# ### Let's look at the gap distributions
# In[34]: | |
# Distribution of individual gap sizes (old builds), incl. tail percentiles.
old_gaps_series = pd.Series(all_gaps_old)
old_gaps_series.describe(percentiles=[.25, .5, .75, .95, .99, .995])
# In[35]:
# Same distribution for the new (fixed) builds.
new_gaps_series = pd.Series(all_gaps_new)
new_gaps_series.describe(percentiles=[.25, .5, .75, .95, .99, .995])
# In[36]: | |
# Histogram of gap sizes, old builds; log y-axis since small gaps dominate.
# bottom=0.1 keeps single-count bars visible on the log scale.
fig, ax = plt.subplots()
old_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[37]:
# Same histogram for the new (fixed) builds.
fig, ax = plt.subplots()
new_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# ### Let's look at the distribution of the per-client gap counts
# In[38]: | |
# Total missing pings per affected client, for the old and new ranges.
client_gaps_old = gaps_old.values().map(sum).collect()
client_gaps_new = gaps_new.values().map(sum).collect()
# In[39]: | |
# Per-client total gap distribution, old builds (log y-axis; bottom=0.1
# keeps single-count bars visible on the log scale).
old_client_gaps_series = pd.Series(client_gaps_old)
fig, ax = plt.subplots()
old_client_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[40]:
# Same view for the new (fixed) builds.
new_client_gaps_series = pd.Series(client_gaps_new)
fig, ax = plt.subplots()
new_client_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment