Skip to content

Instantly share code, notes, and snippets.

@Dexterp37
Last active December 21, 2016 21:23
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Dexterp37/56aeadf8520d0a6d9c23f24bb5609916 to your computer and use it in GitHub Desktop.
Save Dexterp37/56aeadf8520d0a6d9c23f24bb5609916 to your computer and use it in GitHub Desktop.
Bug 1303044 - Validate engagement measurements on Beta
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ## Bug 1303044 - Validate engagement measurements on Beta
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# We get all the pings on Beta, after bug 1293222 landed.
# In[2]:
def dedupe(pings):
    """Keep a single ping for every distinct "meta/documentId".

    The pings are keyed by their document id and pings sharing a key are
    collapsed by a reducer that always returns its first argument; the
    surviving pings are returned without their keys.
    """
    keyed_by_document = pings.map(lambda ping: (ping["meta/documentId"], ping))
    one_per_document = keyed_by_document.reduceByKey(lambda kept, _duplicate: kept)
    return one_per_document.values()
def filter(pings):
    """Reduce the pings to the properties used by this analysis and dedupe them.

    NOTE: this function shadows the builtin ``filter`` for the rest of the
    module.
    """
    wanted_properties = [
        "meta/clientId",
        "meta/documentId",
        "meta/submissionDate",
        "environment/profile/creationDate",
        "environment/profile/resetDate",
        "environment/build",
        "environment/partner",
        "environment/system",
        "payload/info/reason",
        "payload/info/sessionId",
        "payload/info/subsessionLength",
        "payload/info/sessionLength",
        "payload/info/profileSubsessionCounter",
        "payload/processes/parent/scalars",
    ]
    return dedupe(get_pings_properties(pings, wanted_properties))
# Selection criteria for the "main" pings (schema v4) submitted by Firefox
# Beta in the week under analysis.
beta_ping_query = {
    "app": "Firefox",
    "channel": "beta",
    "doc_type": "main",
    "schema": "v4",
    "submission_date": ("20160926", "20161002"),
    "build_id": ("20160920000000", "20160929999999"),  # Post bug 1293222
    "fraction": 1.0,
}
latest_pings = filter(get_pings(sc, **beta_ping_query))
# Both names refer to the very same set of filtered pings.
all_pings = latest_pings
# In[3]:
# Number of distinct client ids found in the selected pings.
client_ids = latest_pings.map(lambda ping: ping["meta/clientId"])
total_clients = client_ids.distinct().count()
# ### Helper functions for plotting and analysing.
# In[4]:
def plot_series(series, graph_bins=100, graph_min=0.1):
    """Draw a log-scale histogram of ``series`` and return its summary.

    ``graph_bins`` is the number of histogram bins and ``graph_min`` is
    passed as the ``bottom`` of the bars.  The return value is the output
    of ``series.describe`` extended with the 50th, 75th, 95th, 99th and
    99.5th percentiles.
    """
    _, axes = plt.subplots()
    series.hist(ax=axes, bins=graph_bins, bottom=graph_min)
    axes.set_yscale('log')

    summary_percentiles = [.5, .75, .95, .99, .995]
    return series.describe(percentiles=summary_percentiles)
def plot_histogram_scalar(pings, scalar_name):
    """Plot the distribution of one parent-process scalar across ``pings``.

    Pings with a missing/empty scalars section, or whose value for
    ``scalar_name`` is absent or falsy, are left out.  The remaining values
    are collected into a pandas Series and handed to ``plot_series``, whose
    result is returned.
    """
    section = "payload/processes/parent/scalars"

    def reports_scalar(ping):
        scalars = ping[section]
        return scalars and scalars.get(scalar_name, False)

    reported_values = pings.filter(reports_scalar).map(lambda ping: ping[section][scalar_name])
    return plot_series(pd.Series(reported_values.collect()))
def values_per_day(pings, scalar):
    """Group the values of ``scalar`` by submission date and client.

    Pings with a missing/empty scalars section, or whose value for
    ``scalar`` is absent or falsy, are skipped.  Each entry of the result
    looks like:
        (("date", "clientId"), [ ... scalar values for the client, date ... ])
    """
    section = "payload/processes/parent/scalars"

    def reports_scalar(ping):
        scalars = ping[section]
        return scalars and scalars.get(scalar, False)

    def to_keyed_value(ping):
        day_and_client = (ping["meta/submissionDate"], ping["meta/clientId"])
        return (day_and_client, [ping[section].get(scalar)])

    def concatenate(left, right):
        return left + right

    return pings.filter(reports_scalar).map(to_keyed_value).reduceByKey(concatenate)
def pct(a, b):
    """Return the ratio ``a / b`` as a float rounded to three decimals."""
    ratio = float(a) / b
    return round(ratio, 3)
# Make sure each ping has a scalars section and that the engagement measurement scalars it contains have the right format.
# In[5]:
def engagement_measurements_check(p):
    """Validate the engagement scalars carried by a single ping.

    Args:
        p: a ping exposing the "payload/processes/parent/scalars" key.

    Returns:
        A ``(status, p)`` tuple.  ``status`` is the empty string when every
        check passes, otherwise a short description of the first failed
        check.
    """
    known_engagement_scalars = {
        "browser.engagement.max_concurrent_tab_count": int,
        "browser.engagement.max_concurrent_window_count": int,
        "browser.engagement.tab_open_event_count": int,
        "browser.engagement.total_uri_count": int,
        "browser.engagement.unique_domains_count": int,
        "browser.engagement.window_open_event_count": int,
    }
    # We know these scalars must be there for the referenced timeframe.
    expected_scalars = [
        "browser.engagement.max_concurrent_tab_count",
        "browser.engagement.max_concurrent_window_count",
    ]

    scalars = p["payload/processes/parent/scalars"]
    if scalars is None:
        return ("scalars section is None", p)

    # We don't expect all the engagement measurements to be there but,
    # if they are, make sure they have the correct format.
    for name, expected_type in known_engagement_scalars.items():
        if name not in scalars:
            continue
        value = scalars[name]
        # Exact type comparison on purpose: a bool is not reported as an int.
        if type(value) is not expected_type:
            return ("wrong type: " + name, p)
        if value < 0:
            # The label states the condition that is actually tested (< 0).
            return ("check failed: " + name + " < 0", p)

    # We're not expecting other scalars from these builds.
    for name in scalars:
        if name not in known_engagement_scalars:
            return ("unexpected scalar: " + name, p)

    for name in expected_scalars:
        if name not in scalars:
            return ("{} not reported".format(name), p)

    return ("", p)
# Label every ping with the outcome of the validation, then tally the pings
# per outcome.
checked_pings = all_pings.map(engagement_measurements_check)
result_counts = checked_pings.countByKey()
result_counts
# ### Let's dig into the pings with missing engagement measurements.
# Report the counts as ratios for better readability.
# In[6]:
total_pings = sum(result_counts.values())
dict((outcome, pct(count, total_pings)) for outcome, count in result_counts.items())
# In[7]:
def _has_empty_scalars(ping):
    """Tell whether the ping has a scalars section with nothing in it."""
    scalars = ping["payload/processes/parent/scalars"]
    return scalars is not None and len(scalars) == 0


missing_eng = latest_pings.filter(_has_empty_scalars)
# How many clients are sending an empty scalars section?
# In[8]:
# Count the distinct ids where the data lives rather than collecting every
# client id to the driver only to take the length of the list.
num_clients_no_eng = missing_eng.map(lambda p: p["meta/clientId"]).distinct().count()
print("Ratio of clients not sending engagement measurements:\t{}"
      .format(pct(num_clients_no_eng, total_clients)))
# Let's check the distribution of the subsession lengths.
# In[9]:
missing_ssl = missing_eng.map(lambda p: p["payload/info/subsessionLength"]).collect()
plot_series(pd.Series(missing_ssl), 30, 0)
# ### Maximum Concurrent Tab Count
# In[10]:
plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_tab_count")
# What's the maximum number of concurrent tabs each user has, per day?
# In[11]:
# values_per_day yields ((date, client), [values]) pairs: keep the value
# lists and reduce each of them to its maximum.
daily_max_tabs_per_user = values_per_day(
    all_pings, "browser.engagement.max_concurrent_tab_count").values().map(np.max)
plot_series(pd.Series(daily_max_tabs_per_user.collect()))
# ### Maximum Concurrent Window Count
# In[12]:
plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_window_count")
# What's the maximum number of concurrent windows each user has, per day?
# In[13]:
daily_max_wins_per_user = values_per_day(
    all_pings, "browser.engagement.max_concurrent_window_count").values().map(np.max)
plot_series(pd.Series(daily_max_wins_per_user.collect()))
# ### Tab Open Event Count
# In[14]:
plot_histogram_scalar(all_pings, "browser.engagement.tab_open_event_count")
# How many tabs are being opened by each user, per day?
# In[15]:
# Here the per-day value lists are summed instead of maximised.
daily_tab_opens_per_user = values_per_day(
    all_pings, "browser.engagement.tab_open_event_count").values().map(np.sum)
plot_series(pd.Series(daily_tab_opens_per_user.collect()))
# Compare the tab open event count and the maximum tab count, aggregated over each session:
# * Get the maximum among all the fragments for the concurrent tabs
# * Sum the open events for each fragment
# In[16]:
def map_to_tab_measurements(p):
    """Turn a ping into ((clientId, sessionId), (tab opens, max tabs)).

    A measurement that is not present in the scalars section counts as 0.
    """
    section = p["payload/processes/parent/scalars"]
    peak_tabs = section.get("browser.engagement.max_concurrent_tab_count", 0)
    opened_tabs = section.get("browser.engagement.tab_open_event_count", 0)
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    return (session_key, (opened_tabs, peak_tabs))
# Only pings with a non-empty scalars section can be mapped.
per_session_tab = latest_pings.filter(
    lambda ping: ping["payload/processes/parent/scalars"]).map(map_to_tab_measurements)
# In[17]:
def _merge_tab_totals(left, right):
    """Add up the tab open events and keep the larger concurrent tab count."""
    return (left[0] + right[0], max(left[1], right[1]))


combined_per_session_tab = per_session_tab.combineByKey(
    lambda first: first, _merge_tab_totals, _merge_tab_totals)
# Plot and describe the number of tab open events per client session.
# In[18]:
per_session_tab_open_events = combined_per_session_tab.values().map(lambda totals: totals[0])
plot_series(pd.Series(per_session_tab_open_events.collect()))
# Plot and describe the maximum number of concurrent tabs per client session.
# In[19]:
per_session_max_tabs = combined_per_session_tab.values().map(lambda totals: totals[1])
plot_series(pd.Series(per_session_max_tabs.collect()))
# ### Window Open Event Count
# In[20]:
plot_histogram_scalar(all_pings, "browser.engagement.window_open_event_count")
# How many windows are being opened by the clients, per day?
# In[21]:
# Sum the value list attached to every (date, client) pair.
daily_win_opens_per_user = values_per_day(
    all_pings, "browser.engagement.window_open_event_count").values().map(np.sum)
plot_series(pd.Series(daily_win_opens_per_user.collect()))
# Compare the window open event count and the maximum window count, aggregated over each session:
# * Get the maximum among all the fragments for the concurrent windows
# * Sum the open events for each fragment
# In[22]:
def map_to_win_measurements(p):
    """Turn a ping into ((clientId, sessionId), (window opens, max windows)).

    A measurement that is not present in the scalars section counts as 0.
    """
    section = p["payload/processes/parent/scalars"]
    peak_windows = section.get("browser.engagement.max_concurrent_window_count", 0)
    opened_windows = section.get("browser.engagement.window_open_event_count", 0)
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    return (session_key, (opened_windows, peak_windows))
# Only pings with a non-empty scalars section can be mapped.
per_session_win = latest_pings.filter(
    lambda ping: ping["payload/processes/parent/scalars"]).map(map_to_win_measurements)
# In[23]:
def _merge_win_totals(left, right):
    """Add up the window open events and keep the larger concurrent window count."""
    return (left[0] + right[0], max(left[1], right[1]))


combined_per_session_win = per_session_win.combineByKey(
    lambda first: first, _merge_win_totals, _merge_win_totals)
# Plot and describe the number of window open events per client session.
# In[24]:
per_session_win_open_events = combined_per_session_win.values().map(lambda totals: totals[0])
plot_series(pd.Series(per_session_win_open_events.collect()))
# Plot and describe the number of maximum concurrent windows per client session.
# In[25]:
per_session_max_windows = combined_per_session_win.values().map(lambda totals: totals[1])
plot_series(pd.Series(per_session_max_windows.collect()))
# How many subsessions don't include a window open event? The statistics below point out that most of the subsessions have 0 window open events.
# In[26]:
# One value per ping with a non-empty scalars section; a missing window open
# count is taken as 0.
subsessions_with_window_open = all_pings.filter(
    lambda ping: ping["payload/processes/parent/scalars"]).map(
    lambda ping: ping["payload/processes/parent/scalars"].get(
        "browser.engagement.window_open_event_count", 0))
scalar_series = pd.Series(subsessions_with_window_open.collect())
plot_series(scalar_series)
# ### Dig deeper into the window open events.
# When analysing Beta data, we found that:
#
# * 50% of the **subsessions** have two window open events.
# * 50% of the **full sessions** have *zero* window open events.
#
# It could be possible in an example like this (see [bug 1303044](https://bugzilla.mozilla.org/show_bug.cgi?id=1303044#c10) for context):
# - 90% of sessions have only one subsession, and those sessions also have 0 window open events.
# - The remaining 10% of sessions have 10 subsessions each, and each of those subsessions has two window open events
# In[27]:
def aggregate_subsession_openevents(p):
    """Turn a ping into ((client_id, session_id), (1, window_open_event)).

    The constant 1 lets a later sum over a session yield its number of
    subsessions; a missing window open count is taken as 0.
    """
    section = p["payload/processes/parent/scalars"]
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    window_opens = section.get("browser.engagement.window_open_event_count", 0)
    return (session_key, (1, window_opens))
# Compute, for each session, the number of window open events and the number of
# considered subsessions.
def _sum_pairs(left, right):
    """Add two (subsession count, window open events) pairs element-wise."""
    return (left[0] + right[0], left[1] + right[1])


# For every (client, session): the number of subsessions seen and the window
# open events they add up to.
pings_with_scalars = latest_pings.filter(
    lambda ping: ping["payload/processes/parent/scalars"])
agg_openevent_ssc = pings_with_scalars.map(aggregate_subsession_openevents).combineByKey(
    lambda first: first, _sum_pairs, _sum_pairs)
# We've got our aggregated data now: the number of subsessions and the number of window open event count per session. Group the open event count by subsession count.
# In[28]:
only_ssl_woe = agg_openevent_ssc.values()
woe = only_ssl_woe.map(lambda pair: (pair[0], [pair[1]])).reduceByKey(
    lambda left, right: left + right)
# Then compute a representative value for each subsession group, i.e. the percentile value of all the window open event count for a particular number of subsessions.
# In[29]:
# 95th percentile of the window open event counts, for each subsession count.
woe_summary = woe.map(
    lambda group: (group[0], np.percentile(group[1], 95.0))).collect()
# Order the points by number of subsessions, so that they can be plotted
# as a line.
woe_sorted = sorted(woe_summary, key=lambda entry: entry[0])
x_ss_per_session = [n_subsessions for n_subsessions, _ in woe_sorted]
y_woe_per_session = [p95_opens for _, p95_opens in woe_sorted]
# In[31]:
# The first element of every aggregated pair is the number of subsessions
# in that session.
subsessions_per_session = only_ssl_woe.keys().collect()
subsession_count_series = pd.Series(subsessions_per_session)
subsession_count_series.describe()
# Two plots side by side on a single figure.
fig, (ax, ax2) = plt.subplots(1, 2, figsize=(10, 6))
# Left: number of window open events vs number of subsessions.
ax.scatter(x_ss_per_session, y_woe_per_session, label='Data points')
#ax.plot(x_ss_per_session, reg_y, label='Regression Line', c='r')
ax.set_xlabel('Number of subsessions per session')
ax.set_ylabel('Number of window open events per session (95p)')
ax.legend()
# Right: histogram of the number of subsessions per session, on a log scale.
subsession_count_series.hist(ax=ax2, bins=20, bottom=0.1)
ax2.set_xlabel('Number of subsessions per session')
ax2.set_ylabel('Frequency')
ax2.set_yscale('log')
plt.show()
# Let's dig into this a bit more and find how sessions with few subsessions behave.
# In[32]:
# Get the p95 percentile for the number of subsessions within sessions.
# 75th, 95th and 99th percentiles of the number of subsessions per session.
subsession_count_series.quantile([.75, .95, .99])
# The vast majority of sessions have at most 2 subsessions. **What's the 95th percentile window open count for these subsession counts?**
# In[38]:
woe_summary_df = pd.DataFrame(
    woe_summary, columns=['No. Subsessions', 'Window Open Events (p95)'])
woe_summary_df[woe_summary_df['No. Subsessions'].isin([1, 2])]
# Compute the descriptive statistics to describe the window open event count for each session having 1 or 2 subsessions at most.
# In[39]:
from scipy import stats
woe_few_subsessions = woe.filter(lambda group: group[0] in (1, 2))
woe_few_subs_stats = woe_few_subsessions.map(
    lambda group: (group[0], stats.describe(group[1]))).collect()
# In[40]:
woe_few_subs_stats
# See how many sessions with 1 or 2 subsessions have a window open event count of 0, how many of 1, etc...
# In[41]:
woe_few_subsessions.values().flatMap(lambda open_counts: open_counts).countByValue()
# ### Total count of URIs
# In[42]:
plot_histogram_scalar(latest_pings, "browser.engagement.total_uri_count")
# How many URIs are the clients opening, per day?
# In[43]:
# Sum the value list attached to every (date, client) pair.
daily_uris_per_user = values_per_day(
    latest_pings, "browser.engagement.total_uri_count").values().map(np.sum)
plot_series(pd.Series(daily_uris_per_user.collect()))
# Take a look at the clients opening more than 10k URIs per subsession.
# In[44]:
URI_THRESHOLD = 10000  # 10k uris


def _exceeds_uri_threshold(ping):
    """Tell whether the ping reports more than URI_THRESHOLD URIs."""
    scalars = ping["payload/processes/parent/scalars"]
    return scalars and scalars.get("browser.engagement.total_uri_count", 0) > URI_THRESHOLD


pings_many_uris = latest_pings.filter(_exceeds_uri_threshold)
# What's the distribution of their subsession lengths?
# In[45]:
pings_many_uris_ssl = pings_many_uris.map(
    lambda ping: ping.get("payload/info/subsessionLength"))
plot_series(pd.Series(pings_many_uris_ssl.collect()), 15, 0)
# And what about their session lengths?
# In[46]:
# Session lengths are only taken from the pings whose reason is "shutdown".
shutdown_pings_many_uris = pings_many_uris.filter(
    lambda ping: ping.get("payload/info/reason") == "shutdown")
pings_many_uris_sl = shutdown_pings_many_uris.map(
    lambda ping: ping.get("payload/info/sessionLength"))
plot_series(pd.Series(pings_many_uris_sl.collect()), 10, 0)
# How many clients are acting like that?
# In[47]:
heavy_uri_loaders_clients = pings_many_uris.map(
    lambda ping: ping["meta/clientId"]).distinct().collect()
heavy_uri_loaders = len(heavy_uri_loaders_clients)
# In[48]:
print("Ratio of clients opening more than 10k URIs:\t{}"
      .format(pct(heavy_uri_loaders, total_clients)))
# Do these clients always behave the same?
# In[49]:
# Testing membership against a set is a constant-time lookup, whereas the
# collected list of client ids would be scanned for every single ping.
heavy_uri_loader_ids = frozenset(heavy_uri_loaders_clients)


def _reports_uri_count(ping):
    """Tell whether the ping carries a truthy total URI count."""
    scalars = ping["payload/processes/parent/scalars"]
    return scalars and scalars.get("browser.engagement.total_uri_count", False)


# (clientId, [URI counts of all the pings sent by that heavy URI loader]).
uri_behaviour = (
    latest_pings
    .filter(lambda p: p["meta/clientId"] in heavy_uri_loader_ids)
    .filter(_reports_uri_count)
    .map(lambda p: (p["meta/clientId"],
                    [p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0)]))
    .reduceByKey(lambda x, y: x + y)
)
# In[50]:
uri_behaviour_rdd = uri_behaviour.map(
    lambda x: (np.min(x[1]), np.max(x[1]), np.percentile(x[1], 75), np.percentile(x[1], 95), len(x[1])))
# Naming the columns in the constructor also works when no row is collected;
# assigning five names to an empty frame afterwards raises a ValueError.
uri_behaviour_df = pd.DataFrame(
    uri_behaviour_rdd.collect(),
    columns=["# URIs min", "# URIs max", "p75", "p95", "Samples"])
uri_behaviour_df
# Inspect other fields to try to figure out if super high URI counts come from some automated instance of Firefox:
#
# * either a new or constantly resetting profile
# * low session counts (1 or "few", profileSubsessionCounter as a proxy?)
# * lower session lengths
# * submit high counts with each session.
# * Maybe a proxy is "for pathological clients, the uri counts p25 is pretty close to p90"?
# In[51]:
# Naive datetime for 1970-01-01 00:00:00, the value utcfromtimestamp(0)
# yields.  Built through the module's explicit ``datetime as dt`` import
# instead of relying on a ``datetime`` name injected by ``%pylab``.
UNIX_EPOCH_DAY = dt.datetime(1970, 1, 1)


def get_session_info(p):
    """Extract the session related fields of a ping into a flat dict.

    Args:
        p: a ping, accessed through ``get`` with the flattened property
            paths used throughout this analysis.

    Returns:
        A dict with the keys "submissionDate", "profileCreationDate",
        "reason", "profileSubsessionCounter", "sessionLength" and
        "subsessionLength".  "profileCreationDate" is UNIX_EPOCH_DAY plus
        the number of days stored in "environment/profile/creationDate",
        or None when the ping has no creation date.
    """
    creation_days = p.get("environment/profile/creationDate")
    if creation_days is None:
        # timedelta(days=None) raises a TypeError: report the date as
        # unknown instead of failing on a ping without a creation date.
        profile_creation_date = None
    else:
        profile_creation_date = UNIX_EPOCH_DAY + dt.timedelta(days=creation_days)

    return {
        "submissionDate": p.get("meta/submissionDate"),
        "profileCreationDate": profile_creation_date,
        "reason": p.get("payload/info/reason"),
        "profileSubsessionCounter": p.get("payload/info/profileSubsessionCounter"),
        "sessionLength": p.get("payload/info/sessionLength"),
        "subsessionLength": p.get("payload/info/subsessionLength"),
    }
# Session details of every ping reporting more URIs than the threshold.
many_uris_session = pings_many_uris.map(get_session_info)
many_uris_session.count()
# In[52]:
# One table row per ping, one column per session info key.
many_uris_session_rows = many_uris_session.collect()
pd.DataFrame(many_uris_session_rows)
# ### Total unique domains count
# In[53]:
plot_histogram_scalar(latest_pings, "browser.engagement.unique_domains_count")
# What's the percentage of session fragments which recorded >= 100 unique domains?
# In[54]:
# Total number of pings: used as the denominator of the ratio printed below.
num_latest_pings = latest_pings.count()
num_latest_pings
# In[55]:
def hit_upper_domain_bound(p):
    """Tell whether a ping reports at least 100 unique domains.

    Pings with a missing/empty scalars section, or without the unique
    domains scalar, yield False.
    """
    scalars = p["payload/processes/parent/scalars"]
    if scalars:
        # False doubles as the "scalar not reported" marker.
        domain_count = scalars.get("browser.engagement.unique_domains_count", False)
        # 100 is the upper bound, we should never go beyond that.
        return domain_count is not False and domain_count >= 100
    return False
pings_hitting_domain_bounds = latest_pings.filter(hit_upper_domain_bound)
domain_count_upper_bounds = pings_hitting_domain_bounds.count()
# In[56]:
# The message now names the threshold the pings were filtered on.
print("Subsessions with >= 100 unique domains: {} - {}".format(
    domain_count_upper_bounds,
    pct(domain_count_upper_bounds, num_latest_pings)))
# How many clients?
# In[57]:
clients_unique_domains = pings_hitting_domain_bounds.map(
    lambda p: p["meta/clientId"]).distinct().collect()
# The pings were selected with ">= 100", so "at least" is the accurate wording.
print("Ratio of clients opening at least 100 unique domains:\t{}"
      .format(pct(len(clients_unique_domains), total_clients)))
# How many heavy URI loaders are also hitting the 100 unique domains?
# In[ ]:
# Both lists come out of distinct(), so the size of the set intersection
# equals the number of shared ids, without scanning one list for every
# element of the other.
len(set(heavy_uri_loaders_clients) & set(clients_unique_domains))
# How many unique domains (maximum among all the session fragments) is each user visiting, per day?
# In[ ]:
# Largest unique domain count among the fragments of every (date, client).
daily_domains_per_user = values_per_day(
    latest_pings, "browser.engagement.unique_domains_count").values().map(np.max)
plot_series(pd.Series(daily_domains_per_user.collect()))
# How many unique domains (p95 among all the session fragments) is each user visiting, per day?
# In[ ]:
# Same grouping, summarised with the 95th percentile instead of the maximum.
daily_domains_per_user = values_per_day(
    latest_pings, "browser.engagement.unique_domains_count").values().map(
    lambda counts: np.percentile(counts, 95))
plot_series(pd.Series(daily_domains_per_user.collect()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment