Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Bug 1276200 - Validate engagement measurements
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ## Bug 1276200 - Validate engagement measurements
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
get_ipython().magic(u'pylab inline')
# We get two chunks of pings:
# * broken_uri_pings: from when the engagement measurements landed (in bug 1271313) up to bug 1293222, which fixes the broken URI counts
# * latest_pings: after bug 1293222 landed
# In[16]:
def dedupe(pings):
    """Keep a single ping per document id.

    Pings are keyed by their "meta/documentId"; when several pings share a
    key, the reducer keeps its first argument and discards the other.
    """
    keyed_by_document = pings.map(lambda ping: (ping["meta/documentId"], ping))
    one_per_document = keyed_by_document.reduceByKey(lambda kept, _duplicate: kept)
    return one_per_document.values()
def filter(pings):
    """Project the pings onto the properties used by this analysis and dedupe them.

    NOTE(review): this name shadows the ``filter`` builtin for the rest of the
    script; it is kept because the ping-loading statements call it by name.
    """
    wanted_properties = [
        "meta/clientId",
        "meta/documentId",
        "meta/submissionDate",
        "environment/profile/creationDate",
        "environment/profile/resetDate",
        "environment/build",
        "environment/partner",
        "environment/system",
        "payload/info/reason",
        "payload/info/sessionId",
        "payload/info/subsessionLength",
        "payload/info/sessionLength",
        "payload/info/profileSubsessionCounter",
        "payload/processes/parent/scalars",
    ]
    return dedupe(get_pings_properties(pings, wanted_properties))
# Nightly "main" pings from builds up to bug 1293222.
broken_uri_pings = filter(
    get_pings(
        sc,
        app="Firefox",
        channel="nightly",
        doc_type="main",
        schema="v4",
        submission_date=("20160808", "20160815"),  # Only one week of submissions.
        build_id=("20160722000000", "20160815000000"),  # Up to bug 1293222
        fraction=1.0
    )
)
# Nightly "main" pings from builds after bug 1293222.
latest_pings = filter(
    get_pings(
        sc,
        app="Firefox",
        channel="nightly",
        doc_type="main",
        schema="v4",
        submission_date=("20160815", "20160822"),
        build_id=("20160815000000", "20160822000000"),  # Post bug 1293222
        fraction=1.0
    )
)
# Both chunks combined, for the checks that do not depend on the URI fix.
all_pings = broken_uri_pings + latest_pings
# Make sure each ping has a scalars section and that the engagement measurement scalars it contains have the right format.
# In[17]:
def engagement_measurements_check(p):
    """Validate the engagement scalars carried by a single ping.

    Args:
        p: ping properties mapping; only "payload/processes/parent/scalars"
            is read.

    Returns:
        A ``(reason, p)`` tuple. ``reason`` is the empty string when every
        check passes, otherwise a short description of the first failed
        check. The tuple shape lets callers aggregate with ``countByKey``.
    """
    known_engagement_scalars = {
        "browser.engagement.max_concurrent_tab_count": int,
        "browser.engagement.max_concurrent_window_count": int,
        "browser.engagement.tab_open_event_count": int,
        "browser.engagement.total_uri_count": int,
        "browser.engagement.unique_domains_count": int,
        "browser.engagement.window_open_event_count": int,
    }
    # We know these scalars must be there for the referenced timeframe.
    expected_scalars = [
        "browser.engagement.max_concurrent_tab_count",
        "browser.engagement.max_concurrent_window_count",
    ]

    scalars = p["payload/processes/parent/scalars"]
    if scalars is None:
        return ("scalars section is None", p)

    # We don't expect all the engagement measurements to be there but,
    # if they are, make sure they have the correct format.
    for name, expected_type in known_engagement_scalars.items():
        if name not in scalars:
            continue
        value = scalars[name]
        # Exact type comparison on purpose: isinstance() would also accept
        # bool, which is a subclass of int.
        if type(value) is not expected_type:
            return ("wrong type: " + name, p)
        if value < 0:
            # The message now states the bound that is actually checked.
            return ("check failed: " + name + " < 0", p)

    # We're not expecting other scalars from these builds.
    for name in scalars:
        if name not in known_engagement_scalars:
            return ("unexpected scalar: " + name, p)

    for name in expected_scalars:
        if name not in scalars:
            return ("{} not reported".format(name), p)

    return ("", p)
# Tally the validation outcome (the first element of each result tuple) over all pings.
checked_pings = all_pings.map(engagement_measurements_check)
result_counts = checked_pings.countByKey()
result_counts
# ### Let's dig into the pings with missing engagement measurements.
# In[5]:
latest_pings.map(engagement_measurements_check).countByKey()
# In[18]:
def _has_empty_scalars(p):
    """Return True when the ping has a scalars section that contains no entries."""
    scalars = p["payload/processes/parent/scalars"]
    return scalars is not None and len(scalars) == 0
missing_eng = latest_pings.filter(_has_empty_scalars)
# How many clients are sending an empty scalars section?
# In[12]:
# Count on the cluster rather than collecting every client id to the driver.
missing_eng.map(lambda p: p["meta/clientId"]).distinct().count()
# Let's check the distribution of the subsession lengths.
# In[20]:
missing_ssl = missing_eng.map(lambda p: p["payload/info/subsessionLength"]).collect()
# NOTE(review): plot_series is defined further down (cell In[4]); presumably that
# cell was executed before this one in the notebook. Confirm before running as a script.
plot_series(pd.Series(missing_ssl), 30, 0)
# ### Helper functions for plotting and analysing.
# In[4]:
def plot_series(series, graph_bins=100, graph_min=0.1):
    """Draw a histogram of ``series`` on a log-scaled y axis and describe it.

    ``graph_bins`` is passed to ``Series.hist`` as ``bins`` and ``graph_min``
    as ``bottom``. Returns ``series.describe`` with the 50th, 75th, 95th,
    99th and 99.5th percentiles.
    """
    _figure, axes = plt.subplots()
    series.hist(ax=axes, bins=graph_bins, bottom=graph_min)
    axes.set_yscale('log')
    reported_percentiles = [.5, .75, .95, .99, .995]
    return series.describe(percentiles=reported_percentiles)
def plot_histogram_scalar(pings, scalar_name):
    """Plot and describe the values of one parent-process scalar across ``pings``.

    Pings with no scalars section, without the scalar, or with a falsy value
    for it (such as 0) are left out of the histogram.
    """
    def reports_scalar(p):
        section = p["payload/processes/parent/scalars"]
        return section and section.get(scalar_name, False)

    reporting_pings = pings.filter(reports_scalar)
    observed_values = reporting_pings.map(
        lambda p: p["payload/processes/parent/scalars"][scalar_name])
    return plot_series(pd.Series(observed_values.collect()))
def values_per_day(pings, scalar):
    """Group the values of ``scalar`` by submission date and client.

    Each entry of the returned RDD looks like:
    (("date", "clientId"), [ ... scalar values for the client, date ... ])
    Pings with no scalars section, or with a missing or falsy (e.g. 0) value
    for ``scalar``, are skipped.
    """
    def reports_scalar(p):
        section = p["payload/processes/parent/scalars"]
        return section and section.get(scalar, False)

    def to_keyed_value(p):
        date_and_client = (p["meta/submissionDate"], p["meta/clientId"])
        return (date_and_client, [p["payload/processes/parent/scalars"].get(scalar)])

    keyed_values = pings.filter(reports_scalar).map(to_keyed_value)
    # Concatenate the single-element lists belonging to the same (date, client).
    return keyed_values.reduceByKey(lambda left, right: left + right)
def pct(a, b):
    """Return ``a / b`` as a float rounded to three decimal places.

    The result is a fraction (e.g. 0.333), not a value scaled to 100.
    """
    ratio = float(a) / b
    return round(ratio, 3)
# ### Maximum Concurrent Tab Count
# In[123]:
plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_tab_count")
# What's the maximum number of concurrent tabs each user has, per day?
# In[124]:
daily_max_tabs_per_user = (
    values_per_day(all_pings, "browser.engagement.max_concurrent_tab_count")
    .map(lambda day_values: np.max(day_values[1]))
)
plot_series(pd.Series(daily_max_tabs_per_user.collect()))
# ### Maximum Concurrent Window Count
# In[125]:
plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_window_count")
# What's the maximum number of concurrent windows each user has, per day?
# In[126]:
daily_max_wins_per_user = (
    values_per_day(all_pings, "browser.engagement.max_concurrent_window_count")
    .map(lambda day_values: np.max(day_values[1]))
)
plot_series(pd.Series(daily_max_wins_per_user.collect()))
# ### Tab Open Event Count
# In[127]:
plot_histogram_scalar(all_pings, "browser.engagement.tab_open_event_count")
# How many tabs are being opened by each user, per day?
# In[128]:
daily_tab_opens_per_user = (
    values_per_day(all_pings, "browser.engagement.tab_open_event_count")
    .map(lambda day_values: np.sum(day_values[1]))
)
plot_series(pd.Series(daily_tab_opens_per_user.collect()))
# Compare the tab open events and the maximum tab count over a session (keyed by client and session id):
# * Take the maximum concurrent tab count among all the session's fragments
# * Sum the tab open events over all the fragments
# In[129]:
def map_to_tab_measurements(p):
    """Map a ping to ``((clientId, sessionId), (open_cnt, max_cnt))`` for tabs.

    Either count defaults to 0 when the ping's scalars do not report it.
    """
    scalars = p["payload/processes/parent/scalars"]
    tab_counts = (
        scalars.get("browser.engagement.tab_open_event_count", 0),
        scalars.get("browser.engagement.max_concurrent_tab_count", 0),
    )
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    return (session_key, tab_counts)
# Only pings with a non-empty scalars section take part; each becomes a
# ((clientId, sessionId), (open_cnt, max_cnt)) entry.
per_session_tab = (
    latest_pings
    .filter(lambda p: p["payload/processes/parent/scalars"])
    .map(map_to_tab_measurements)
)
# In[130]:
# Within a session the open events add up, while the concurrent count keeps its maximum.
combined_per_session_tab = per_session_tab.combineByKey(
    lambda first: first,
    lambda acc, nxt: (acc[0] + nxt[0], max(acc[1], nxt[1])),
    lambda left, right: (left[0] + right[0], max(left[1], right[1]))
)
# Plot and describe the number of tab open events per client session.
# In[131]:
per_session_tab_open_events = combined_per_session_tab.map(lambda entry: entry[1][0])
plot_series(pd.Series(per_session_tab_open_events.collect()))
# Plot and describe the maximum number of concurrent tabs per client session.
# In[132]:
per_session_max_tabs = combined_per_session_tab.map(lambda entry: entry[1][1])
plot_series(pd.Series(per_session_max_tabs.collect()))
# ### Window Open Event Count
# In[133]:
plot_histogram_scalar(all_pings, "browser.engagement.window_open_event_count")
# How many windows are being opened by the clients, per day?
# In[134]:
daily_win_opens_per_user = (
    values_per_day(all_pings, "browser.engagement.window_open_event_count")
    .map(lambda day_values: np.sum(day_values[1]))
)
plot_series(pd.Series(daily_win_opens_per_user.collect()))
# Compare the window open events and the maximum window count over a session (keyed by client and session id):
# * Take the maximum concurrent window count among all the session's fragments
# * Sum the window open events over all the fragments
# In[135]:
def map_to_win_measurements(p):
    """Map a ping to ``((clientId, sessionId), (open_cnt, max_cnt))`` for windows.

    Either count defaults to 0 when the ping's scalars do not report it.
    """
    scalars = p["payload/processes/parent/scalars"]
    window_counts = (
        scalars.get("browser.engagement.window_open_event_count", 0),
        scalars.get("browser.engagement.max_concurrent_window_count", 0),
    )
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    return (session_key, window_counts)
# Only pings with a non-empty scalars section take part; each becomes a
# ((clientId, sessionId), (open_cnt, max_cnt)) entry.
per_session_win = (
    latest_pings
    .filter(lambda p: p["payload/processes/parent/scalars"])
    .map(map_to_win_measurements)
)
# In[136]:
# Within a session the open events add up, while the concurrent count keeps its maximum.
combined_per_session_win = per_session_win.combineByKey(
    lambda first: first,
    lambda acc, nxt: (acc[0] + nxt[0], max(acc[1], nxt[1])),
    lambda left, right: (left[0] + right[0], max(left[1], right[1]))
)
# Plot and describe the number of window open events per client session.
# In[137]:
per_session_win_open_events = combined_per_session_win.map(lambda entry: entry[1][0])
plot_series(pd.Series(per_session_win_open_events.collect()))
# Plot and describe the number of maximum concurrent windows per client session.
# In[138]:
per_session_max_windows = combined_per_session_win.map(lambda entry: entry[1][1])
plot_series(pd.Series(per_session_max_windows.collect()))
# How many subsessions don't include a window open event? The statistics below point out that most of the subsessions have 0 window open events.
# In[9]:
# Unlike plot_histogram_scalar, a missing scalar counts as 0 here instead of being dropped.
subsessions_with_window_open = (
    all_pings
    .filter(lambda p: p["payload/processes/parent/scalars"])
    .map(lambda p: p["payload/processes/parent/scalars"].get("browser.engagement.window_open_event_count", 0))
)
scalar_series = pd.Series(subsessions_with_window_open.collect())
plot_series(scalar_series)
# ### Total count of URIs
# In[139]:
plot_histogram_scalar(broken_uri_pings, "browser.engagement.total_uri_count")
# In[140]:
plot_histogram_scalar(latest_pings, "browser.engagement.total_uri_count")
# How many URIs are the clients opening, per day?
# In[141]:
daily_uris_per_user = (
    values_per_day(latest_pings, "browser.engagement.total_uri_count")
    .map(lambda day_values: np.sum(day_values[1]))
)
plot_series(pd.Series(daily_uris_per_user.collect()))
# Take a look at the clients opening more than 10k URIs per subsession.
# In[142]:
URI_THRESHOLD = 10000 # 10k uris
pings_many_uris = (
    latest_pings
    .filter(lambda p: p["payload/processes/parent/scalars"])
    .filter(lambda p: p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0) > URI_THRESHOLD)
)
# What's the distribution of their subsession lengths?
# In[143]:
pings_many_uris_ssl = pings_many_uris.map(lambda p: p.get("payload/info/subsessionLength"))
plot_series(pd.Series(pings_many_uris_ssl.collect()), 10, 0)
# And what about their session lengths?
# In[144]:
# Only pings whose reason is "shutdown" contribute a session length here.
pings_many_uris_sl = (
    pings_many_uris
    .filter(lambda p: p.get("payload/info/reason") == "shutdown")
    .map(lambda p: p.get("payload/info/sessionLength"))
)
plot_series(pd.Series(pings_many_uris_sl.collect()), 10, 0)
# How many clients are acting like that?
# In[145]:
# Distinct client ids that sent at least one ping above URI_THRESHOLD.
heavy_uri_loaders_clients = pings_many_uris.map(lambda p: p["meta/clientId"]).distinct().collect()
heavy_uri_loaders = len(heavy_uri_loaders_clients)
total_clients = latest_pings.map(lambda p: p["meta/clientId"]).distinct().count()
# In[146]:
print("{} clients ({}) are opening more than 10k URIs".format(heavy_uri_loaders, pct(heavy_uri_loaders, total_clients)))
# Do these clients always behave the same?
# In[147]:
# Hash-based lookup: the membership test below runs once per ping, so avoid scanning a list.
heavy_uri_loaders_lookup = frozenset(heavy_uri_loaders_clients)
# One entry per heavy client: (clientId, [URI counts of all its pings that report a truthy count]).
uri_behaviour = (
    latest_pings
    .filter(lambda p: p["meta/clientId"] in heavy_uri_loaders_lookup)
    .filter(lambda p: p["payload/processes/parent/scalars"] and p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", False))
    .map(lambda p: (p["meta/clientId"], [p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0)]))
    .reduceByKey(lambda x, y: x + y)
)
# In[148]:
# Per client: minimum, maximum, 75th and 95th percentile of the URI counts, and the sample count.
uri_behaviour_rdd = uri_behaviour.map(lambda x: (np.min(x[1]), np.max(x[1]), np.percentile(x[1], 75), np.percentile(x[1], 95), len(x[1])))
# The original read from an undefined `uri_rdd`; the RDD built above is the intended source.
# Passing the column names to the constructor also keeps an empty result from raising.
uri_behaviour_df = pd.DataFrame(
    uri_behaviour_rdd.collect(),
    columns=["# URIs min", "# URIs max", "p75", "p95", "Samples"]
)
uri_behaviour_df
# Inspect other fields to try to figure out whether the very high URI counts come from some automated instance of Firefox:
#
# * either a new or constantly resetting profile
# * low session counts (1 or "few", profileSubsessionCounter as a proxy?)
# * lower session lengths
# * submit high counts with each session.
# * Maybe a proxy is "for pathological clients, the uri counts p25 is pretty close to p90"?
# In[149]:
# 1970-01-01 00:00:00 as a naive datetime. The module is imported as `dt` in this
# file, so it is referenced through that alias instead of the bare name `datetime`.
UNIX_EPOCH_DAY = dt.datetime(1970, 1, 1)
def get_session_info(p):
    """Summarise the session-related fields of one ping.

    Args:
        p: ping properties mapping, read with ``.get`` so absent keys give None.

    Returns:
        A dict with the submission date, the profile creation date, the ping
        reason, the profile subsession counter and the session and subsession
        lengths. "profileCreationDate" is UNIX_EPOCH_DAY plus the reported
        number of days, or None when the ping carries no creation date.
    """
    creation_days = p.get("environment/profile/creationDate")
    if creation_days is None:
        # timedelta(days=None) raises TypeError; report the date as missing instead.
        profile_creation_date = None
    else:
        profile_creation_date = UNIX_EPOCH_DAY + dt.timedelta(days=creation_days)
    return {
        "submissionDate": p.get("meta/submissionDate"),
        "profileCreationDate": profile_creation_date,
        "reason": p.get("payload/info/reason"),
        "profileSubsessionCounter": p.get("payload/info/profileSubsessionCounter"),
        "sessionLength": p.get("payload/info/sessionLength"),
        "subsessionLength": p.get("payload/info/subsessionLength")
    }
# One summary dict (see get_session_info) per ping above the URI threshold.
many_uris_session = pings_many_uris.map(get_session_info)
# Bare expressions such as the next line are notebook cell outputs.
many_uris_session.count()
# In[150]:
pd.DataFrame(many_uris_session.collect())
# ### Total unique domains count
# In[151]:
plot_histogram_scalar(broken_uri_pings, "browser.engagement.unique_domains_count")
# In[152]:
plot_histogram_scalar(latest_pings, "browser.engagement.unique_domains_count")
# What's the percentage of session fragments which recorded >= 100 unique domains?
# In[153]:
# Total number of pings in latest_pings; used later as the denominator for that share.
num_latest_pings = latest_pings.count()
num_latest_pings
# In[154]:
def hit_upper_domain_bound(p):
    """Tell whether a ping reports at least 100 unique domains.

    Returns False when the scalars section is missing or empty, or when the
    unique domains scalar is not reported.
    """
    scalars = p["payload/processes/parent/scalars"]
    if not scalars:
        return False
    domain_count = scalars.get("browser.engagement.unique_domains_count", False)
    # 100 is the upper bound, we should never go beyond that.
    return domain_count is not False and domain_count >= 100
# Pings whose unique domains scalar is at the 100 bound (or above it).
pings_hitting_domain_bounds = latest_pings.filter(hit_upper_domain_bound)
domain_count_upper_bounds = pings_hitting_domain_bounds.count()
# In[155]:
# The message now names the threshold that the filter applies.
print("Subsessions with >= 100 unique domains: {} - {}".format(
    domain_count_upper_bounds,
    pct(domain_count_upper_bounds, num_latest_pings)))
# How many clients?
# In[156]:
clients_unique_domains = pings_hitting_domain_bounds.map(lambda p: p["meta/clientId"]).distinct().collect()
# The filter is ">= 100", so the message says "at least" rather than "more than".
print("{} clients ({}) are opening at least 100 unique domains".format(
    len(clients_unique_domains),
    pct(len(clients_unique_domains), total_clients)))
# How many heavy URI loaders are also hitting the 100 unique domains?
# In[157]:
# Both lists come from distinct(), so the size of the set intersection equals the
# number of shared clients without scanning one list per element of the other.
len(set(heavy_uri_loaders_clients) & set(clients_unique_domains))
# How many unique domains (maximum among all the session fragments) is each user visiting, per day?
# In[158]:
# Per (date, client): the largest unique domains count among that day's pings.
daily_domains_per_user = (
    values_per_day(latest_pings, "browser.engagement.unique_domains_count")
    .map(lambda day_values: np.max(day_values[1]))
)
plot_series(pd.Series(daily_domains_per_user.collect()))
# How many unique domains (p95 among all the session fragments) is each user visiting, per day?
# In[159]:
# The same name is rebound here to the 95th-percentile variant.
daily_domains_per_user = (
    values_per_day(latest_pings, "browser.engagement.unique_domains_count")
    .map(lambda day_values: np.percentile(day_values[1], 95))
)
plot_series(pd.Series(daily_domains_per_user.collect()))
# # Data correlations
# This section tries to dive into the relationship between the previous results:
# * (1) 50% of users open at least 15 tabs per day
# * (2) 50% of users open at least 2 windows per day
# * (3) 50% of users open at least 13 URIs per day
# ### Can we explain the relationship between (1) & (3)?
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment