# coding: utf-8
# ## [Bug 1268513](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513) - Validate "core" ping re-uploader in Fennec 49 Nightly
# Validate "core" pings sent by Firefox for Android 46 to make sure the data and volumes look sane.
# In[6]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# We get the pings separately from 3 different build ranges here:
# * ``old`` - before [bug 1268513](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513)
# * ``suspect`` - after bug 1268513 but before all the [bug fixes mentioned here](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513#c12)
# * ``new`` - after all the bug fixes
# In[7]:
def filtered(pings):
    # Pull out the fields we need and keep only Android pings.
    props = get_pings_properties(pings, ["os", "clientId", "seq",
                                         "meta/documentId", "meta/submissionDate"])
    return props.filter(lambda p: p.get("os", "") == "Android")
old_pings = filtered(get_pings(sc,
                               app="Fennec",
                               channel="nightly",
                               doc_type="core",
                               source_version="*",
                               submission_date=("20160418", "20160527"),
                               build_id=("20160418000000", "20160503000000"),
                               fraction=1.0))
suspect_pings = filtered(get_pings(sc,
                                   app="Fennec",
                                   channel="nightly",
                                   doc_type="core",
                                   source_version="*",
                                   submission_date=("20160504", "20160527"),
                                   build_id=("20160504000000", "20160515000000"),
                                   fraction=1.0))
new_pings = filtered(get_pings(sc,
                               app="Fennec",
                               channel="nightly",
                               doc_type="core",
                               source_version="*",
                               submission_date=("20160516", "20160527"),
                               build_id=("20160516000000", "20160524000000"),
                               fraction=1.0))
# How many pings are we looking at?
# In[8]:
old_ping_counts = old_pings.count()
suspect_ping_counts = suspect_pings.count()
new_ping_counts = new_pings.count()
(old_ping_counts, suspect_ping_counts, new_ping_counts)
# How many different clients are we seeing?
# In[9]:
old_client_count = get_one_ping_per_client(old_pings).count()
suspect_client_count = get_one_ping_per_client(suspect_pings).count()
new_client_count = get_one_ping_per_client(new_pings).count()
(old_client_count, suspect_client_count, new_client_count)
# Dedupe pings.
# In[10]:
def dedupe(pings):
    # Key each ping by its documentId and keep only one ping per id.
    return pings.map(lambda p: (p["meta/documentId"], p)) \
                .reduceByKey(lambda a, b: a) \
                .values()
old_pings = dedupe(old_pings)
suspect_pings = dedupe(suspect_pings)
new_pings = dedupe(new_pings)
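# In[ ]:
# Quick illustration of the keep-one dedupe above (not part of the original
# analysis): records sharing a key collapse to a single record.
sc.parallelize([("doc1", 1), ("doc1", 2), ("doc2", 3)]) \
  .reduceByKey(lambda a, b: a) \
  .values() \
  .count()  # -> 2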
# How many duplicates did we discard?
# In[11]:
old_deduped_count = old_pings.count()
suspect_deduped_count = suspect_pings.count()
new_deduped_count = new_pings.count()
old_discarded = old_ping_counts - old_deduped_count
suspect_discarded = suspect_ping_counts - suspect_deduped_count
new_discarded = new_ping_counts - new_deduped_count
# In[12]:
print "old discarded: ", (old_discarded, round(float(old_discarded) / old_ping_counts, 3))
print "suspect discarded: ", (suspect_discarded, round(float(suspect_discarded) / suspect_ping_counts, 3))
print "new discarded: ", (new_discarded, round(float(new_discarded) / new_ping_counts, 3))
# So we now see considerably more duplicate pings than before.
# ### How is the daily submission count progressing?
# In[13]:
merged = old_pings + suspect_pings + new_pings
# In[14]:
counts_by_day = merged.map(lambda p: p["meta/submissionDate"]).countByValue()
# In[15]:
daily_counts = pd.Series(counts_by_day)
daily_counts.describe()
# In[16]:
daily_counts.plot(title="Daily ping counts")
# ### How many clients do we see per day?
# In[17]:
# Collect the set of distinct clientIds per submission date, then count them.
adi = merged.map(lambda p: (p["meta/submissionDate"], p["clientId"])) \
            .aggregateByKey(set(),
                            lambda s, id: s.add(id) or s,
                            lambda s1, s2: s1.update(s2) or s1) \
            .map(lambda t: (t[0], len(t[1])))
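# In[ ]:
# An equivalent, simpler formulation (a sketch, not from the original
# notebook; `adi_alt` is an illustrative name): count the distinct
# (submissionDate, clientId) pairs per day.
adi_alt = merged.map(lambda p: (p["meta/submissionDate"], p["clientId"])) \
                .distinct() \
                .countByKey()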
# In[18]:
adi_series = pd.Series(dict(sorted(adi.collect())))
adi_series.describe()
# In[19]:
adi_series.plot(title="ADI")
# ### Extract the ping sequence gaps per client
# In[21]:
def get_seq_gaps(pings):
    # Sizes of the gaps between consecutive seq numbers of one client's pings.
    seqs = sorted([p["seq"] for p in pings])
    return filter(lambda x: x > 0, np.diff(seqs) - 1)

def grouped_gaps(pings):
    return pings.groupBy(lambda p: p["clientId"]) \
                .map(lambda t: (t[0], get_seq_gaps(t[1])))

def find_clients_with_gaps(pings):
    return grouped_gaps(pings).filter(lambda t: len(t[1]) > 0)
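# In[ ]:
# Sanity check of the gap logic on a hand-made example (illustrative, not
# part of the original notebook): a client that submitted seq numbers
# 1, 2, 5, 6 is missing seqs 3 and 4, which shows up as one gap of size 2.
get_seq_gaps([{"seq": s} for s in [1, 2, 5, 6]])  # -> [2]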
# In[25]:
gaps_new = find_clients_with_gaps(new_pings).cache()
gaps_sus = find_clients_with_gaps(suspect_pings).cache()
gaps_old = find_clients_with_gaps(old_pings).cache()
# How many clients are affected?
# In[29]:
def pct(a, b):
return round(float(a) / b, 3)
# In[31]:
print "old: ", gaps_old.count(), pct(gaps_old.count(), old_client_count)
print "suspect: ", gaps_sus.count(), pct(gaps_sus.count(), suspect_client_count)
print "new: ", gaps_new.count(), pct(gaps_new.count(), new_client_count)
# Good progress: far fewer clients are affected by missing pings - 3.1% vs. previously 20.4%.
#
# I'm surprised by the high number of affected clients before the change; I don't think we have explicitly looked at that number before.
# From the progression it seems that the [bugs mentioned here](https://bugzilla.mozilla.org/show_bug.cgi?id=1268513#c12) had a positive impact.
# ### Look at overall gap counts
# In[32]:
all_gaps_old = gaps_old.values().aggregate([], lambda a, b: a + b, lambda a, b: a + b)
all_gaps_sus = gaps_sus.values().aggregate([], lambda a, b: a + b, lambda a, b: a + b)
all_gaps_new = gaps_new.values().aggregate([], lambda a, b: a + b, lambda a, b: a + b)
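# The aggregate() calls above just flatten the per-client gap lists; an
# equivalent formulation (a sketch, not from the original notebook) would be:
# all_gaps_old = gaps_old.values().flatMap(lambda gs: gs).collect()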
# In[33]:
(pct(sum(all_gaps_old), old_deduped_count),
 pct(sum(all_gaps_sus), suspect_deduped_count),
 pct(sum(all_gaps_new), new_deduped_count))
# Hm, that seems pretty high? But the missing ping count just dropped by a third with the new uploader, so that's progress.
#
# We need to monitor this closely on Beta and figure out whether this is strange Nightly behavior or a general problem.
# ### Let's look at the gap distributions
# In[34]:
old_gaps_series = pd.Series(all_gaps_old)
old_gaps_series.describe(percentiles=[.25, .5, .75, .95, .99, .995])
# In[35]:
new_gaps_series = pd.Series(all_gaps_new)
new_gaps_series.describe(percentiles=[.25, .5, .75, .95, .99, .995])
# In[36]:
fig, ax = plt.subplots()
old_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[37]:
fig, ax = plt.subplots()
new_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# ### Let's look at the distribution of the per client gap counts
# In[38]:
client_gaps_old = gaps_old.values().map(lambda gs: sum(gs)).collect()
client_gaps_new = gaps_new.values().map(lambda gs: sum(gs)).collect()
# In[39]:
old_client_gaps_series = pd.Series(client_gaps_old)
fig, ax = plt.subplots()
old_client_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[40]:
new_client_gaps_series = pd.Series(client_gaps_new)
fig, ax = plt.subplots()
new_client_gaps_series.hist(ax=ax, bins=100, bottom=0.1)
ax.set_yscale('log')
# In[ ]: