Dexterp37/Bug 1276200 - Validate engagement measurements.ipynb

## Bug 1276200 - Validate engagement measurements.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Bug 1276200 - Validate engagement measurements.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Bug 1276200 - Validate engagement measurements.py

# coding: utf-8

# ## Bug 1276200 - Validate engagement measurements

# In[1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import datetime as dt
from uuid import UUID

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history

get_ipython().magic(u'pylab inline')


# We get two chunks of pings:
#  * broken_uri_pings: after the engagement measurements landed (in bug 1271313) up to bug 1293222, which fixes the broken URI counts
#  * latest_pings: after bug 1293222 lands

# In[16]:

def dedupe(pings):
    """Keep a single ping for every distinct "meta/documentId".

    The pings are keyed by their document id; whenever several pings share
    an id, one of them is kept and the others are discarded.
    """
    keyed_by_doc_id = pings.map(lambda ping: (ping["meta/documentId"], ping))
    one_per_doc_id = keyed_by_doc_id.reduceByKey(lambda kept, _duplicate: kept)
    return one_per_doc_id.values()

def filter(pings):
    """Reduce the pings to the properties used by this analysis, then dedupe.

    NOTE(review): the name shadows the built-in ``filter``; it is kept as-is
    because the rest of the file calls the function under this name.
    """
    wanted_properties = [
        "meta/clientId",
        "meta/documentId",
        "meta/submissionDate",
        "environment/profile/creationDate",
        "environment/profile/resetDate",
        "environment/build",
        "environment/partner",
        "environment/system",
        "payload/info/reason",
        "payload/info/sessionId",
        "payload/info/subsessionLength",
        "payload/info/sessionLength",
        "payload/info/profileSubsessionCounter",
        "payload/processes/parent/scalars",
    ]
    return dedupe(get_pings_properties(pings, wanted_properties))

# First chunk: nightly "main" pings submitted during one week, restricted to
# the build ids that precede the URI-count fix.
broken_uri_pings = filter(get_pings(sc,
                            app="Firefox",
                            channel="nightly",
                            doc_type="main",
                            schema="v4",
                            submission_date=("20160808", "20160815"), # Only one week of submissions.
                            build_id=("20160722000000", "20160815000000"), # Up to bug 1293222
                            fraction=1.0))

# Second chunk: the following week, restricted to the later build ids.
latest_pings = filter(get_pings(sc,
                        app="Firefox",
                        channel="nightly",
                        doc_type="main",
                        schema="v4",
                        submission_date=("20160815", "20160822"),
                        build_id=("20160815000000", "20160822000000"), # Post bug 1293222
                        fraction=1.0))

# Both chunks together. Each chunk was de-duplicated on its own by filter().
# NOTE(review): the two date/build ranges share their boundary value; if the
# ranges are inclusive a ping could land in both chunks - TODO confirm.
all_pings = broken_uri_pings + latest_pings


# Make sure each ping has a scalars section and that the engagement measurement scalars it contains have the right format.

# In[17]:

def engagement_measurements_check(p):
    """Validate the engagement scalars carried by a single ping.

    Args:
        p: mapping of ping properties; only the
            "payload/processes/parent/scalars" entry is read.

    Returns:
        A ``(problem, p)`` tuple. ``problem`` is the empty string when every
        check passes, otherwise a short description of the first check that
        failed. The tuple shape lets the caller count pings per problem.
    """
    known_engagement_scalars = {
        "browser.engagement.max_concurrent_tab_count": int,
        "browser.engagement.max_concurrent_window_count": int,
        "browser.engagement.tab_open_event_count": int,
        "browser.engagement.total_uri_count": int,
        "browser.engagement.unique_domains_count": int,
        "browser.engagement.window_open_event_count": int,
    }

    # We know these scalars must be there for the referenced timeframe.
    expected_scalars = [
        "browser.engagement.max_concurrent_tab_count",
        "browser.engagement.max_concurrent_window_count"
    ]

    scalars = p["payload/processes/parent/scalars"]

    if scalars is None:
        return ("scalars section is None", p)

    # We don't expect all the engagement measurements to be there but,
    # if they are, make sure they have the correct format.
    # items() (rather than iteritems()) works on both Python 2 and 3.
    for name, expected_type in known_engagement_scalars.items():
        if name not in scalars:
            continue
        value = scalars[name]
        # Exact type match on purpose: isinstance() would accept booleans.
        if type(value) is not expected_type:
            return ("wrong type: " + name, p)
        if value < 0:
            # The message now states the condition that is actually tested.
            return ("check failed: " + name + " < 0", p)

    # We're not expecting other scalars from these builds.
    for name in scalars:
        if name not in known_engagement_scalars:
            return ("unexpected scalar: " + name, p)

    for name in expected_scalars:
        if name not in scalars:
            return ("{} not reported".format(name), p)

    return ("", p)

checked_pings = all_pings.map(engagement_measurements_check)
result_counts = checked_pings.countByKey()
result_counts


# ### Let's dig into the pings with missing engagement measurements.

# In[5]:

latest_pings.map(engagement_measurements_check).countByKey()


# In[18]:

# Pings whose scalars section is present but holds no entries at all.
# None is compared by identity, and emptiness is tested on the container
# itself instead of on a temporary list of its keys.
missing_eng = latest_pings.filter(
    lambda p: p["payload/processes/parent/scalars"] is not None and
              len(p["payload/processes/parent/scalars"]) == 0)


# How many clients are sending an empty scalars section?

# In[12]:

# Count the distinct client ids on the cluster instead of collecting every
# id to the driver only to take the length of the resulting list.
missing_eng.map(lambda p: p["meta/clientId"]).distinct().count()


# Let's check the distribution of the subsession lengths.

# In[20]:

missing_ssl = missing_eng.map(lambda p: p["payload/info/subsessionLength"]).collect()
plot_series(pd.Series(missing_ssl), 30, 0)


# ### Helper functions for plotting and analysing.

# In[4]:

def plot_series(series, graph_bins=100, graph_min=0.1):
    """Draw a log-scaled histogram of ``series`` and summarise it.

    ``graph_bins`` is forwarded as the histogram's ``bins`` and ``graph_min``
    as its ``bottom``. The return value is ``series.describe()`` computed
    with the 50th, 75th, 95th, 99th and 99.5th percentiles.
    """
    _figure, axes = plt.subplots()
    series.hist(ax=axes, bins=graph_bins, bottom=graph_min)
    axes.set_yscale('log')

    reported_percentiles = [.5, .75, .95, .99, .995]
    return series.describe(percentiles=reported_percentiles)

def plot_histogram_scalar(pings, scalar_name):
    """Plot and describe the values of one scalar across ``pings``.

    Pings with a missing or empty scalars section are skipped, and so are
    pings where the scalar is absent or falsy - which includes a reported
    value of 0. Returns whatever ``plot_series`` returns.
    """
    scalars_key = "payload/processes/parent/scalars"

    def reports_scalar(ping):
        section = ping[scalars_key]
        return section and section.get(scalar_name, False)

    def read_scalar(ping):
        return ping[scalars_key][scalar_name]

    collected_values = pings.filter(reports_scalar).map(read_scalar).collect()
    return plot_series(pd.Series(collected_values))

def values_per_day(pings, scalar):
    """Group the values of ``scalar`` by submission date and client.

    Each entry of the returned RDD looks like:
    (("date", "clientId"), [ ... scalar values for the client, date ... ])
    Pings where the scalar is absent or falsy (0 included) are left out.
    """
    scalars_key = "payload/processes/parent/scalars"

    def reports_scalar(ping):
        section = ping[scalars_key]
        return section and section.get(scalar, False)

    def to_keyed_value(ping):
        day_and_client = (ping["meta/submissionDate"], ping["meta/clientId"])
        return (day_and_client, [ping[scalars_key].get(scalar)])

    keyed_values = pings.filter(reports_scalar).map(to_keyed_value)
    return keyed_values.reduceByKey(lambda left, right: left + right)

def pct(a, b):
    """Return ``a / b`` as a float rounded to three decimal places.

    Despite the name, the result is a fraction (0.25), not a percentage (25).
    """
    ratio = float(a) / b
    return round(ratio, 3)


# ### Maximum Concurrent Tab Count

# In[123]:

plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_tab_count")


# What's the maximum number of concurrent tabs each user has, per day?

# In[124]:

daily_max_tabs_per_user = values_per_day(all_pings, "browser.engagement.max_concurrent_tab_count")                             .map(lambda x: np.max(x[1]))
plot_series(pd.Series(daily_max_tabs_per_user.collect()))


# ### Maximum Concurrent Window Count

# In[125]:

plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_window_count")


# What's the maximum number of concurrent windows each user has, per day?

# In[126]:

daily_max_wins_per_user = values_per_day(all_pings, "browser.engagement.max_concurrent_window_count")                             .map(lambda x: np.max(x[1]))
plot_series(pd.Series(daily_max_wins_per_user.collect()))


# ### Tab Open Event Count

# In[127]:

plot_histogram_scalar(all_pings, "browser.engagement.tab_open_event_count")


# How many tabs are being opened by each user, per day?

# In[128]:

daily_tab_opens_per_user = values_per_day(all_pings, "browser.engagement.tab_open_event_count")                             .map(lambda x: np.sum(x[1]))
plot_series(pd.Series(daily_tab_opens_per_user.collect()))


# Compare tab open events and the maximum tab count, over a session:
# * Get the maximum among all the fragments for the concurrent tabs
# * Sum the open events for each fragment

# In[129]:

def map_to_tab_measurements(p):
    """Turn a ping into ((clientId, sessionId), (tab opens, max tabs)).

    A measurement missing from the scalars section counts as 0.
    """
    section = p["payload/processes/parent/scalars"]
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    tab_opens = section.get("browser.engagement.tab_open_event_count", 0)
    max_tabs = section.get("browser.engagement.max_concurrent_tab_count", 0)
    return (session_key, (tab_opens, max_tabs))

per_session_tab = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"])                              .map(map_to_tab_measurements)


# In[130]:

combined_per_session_tab = per_session_tab.combineByKey(lambda x: x,
                                                        lambda acc, x: (acc[0] + x[0], max(acc[1], x[1])),
                                                        lambda x, y: (x[0] + y[0], max(x[1], y[1])))


# Plot and describe the number of tab open events per client session.

# In[131]:

per_session_tab_open_events = combined_per_session_tab.map(lambda x: x[1][0])
plot_series(pd.Series(per_session_tab_open_events.collect()))


# Plot and describe the maximum number of concurrent tabs per client session.

# In[132]:

per_session_max_tabs = combined_per_session_tab.map(lambda x: x[1][1])
plot_series(pd.Series(per_session_max_tabs.collect()))


# ### Window Open Event Count

# In[133]:

plot_histogram_scalar(all_pings, "browser.engagement.window_open_event_count")


# How many windows are being opened by the clients, per day?

# In[134]:

daily_win_opens_per_user = values_per_day(all_pings, "browser.engagement.window_open_event_count")                             .map(lambda x: np.sum(x[1]))
plot_series(pd.Series(daily_win_opens_per_user.collect()))


# Compare window open events and the maximum window count, over a session:
# * Get the maximum among all the fragments for the concurrent windows
# * Sum the open events for each fragment

# In[135]:

def map_to_win_measurements(p):
    """Turn a ping into ((clientId, sessionId), (window opens, max windows)).

    A measurement missing from the scalars section counts as 0.
    """
    section = p["payload/processes/parent/scalars"]
    session_key = (p["meta/clientId"], p["payload/info/sessionId"])
    window_opens = section.get("browser.engagement.window_open_event_count", 0)
    max_windows = section.get("browser.engagement.max_concurrent_window_count", 0)
    return (session_key, (window_opens, max_windows))

per_session_win = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"])                              .map(map_to_win_measurements)


# In[136]:

combined_per_session_win = per_session_win.combineByKey(lambda x: x,
                                                        lambda acc, x: (acc[0] + x[0], max(acc[1], x[1])),
                                                        lambda x, y: (x[0] + y[0], max(x[1], y[1])))


# Plot and describe the number of window open events per client session.

# In[137]:

per_session_win_open_events = combined_per_session_win.map(lambda x: x[1][0])
plot_series(pd.Series(per_session_win_open_events.collect()))


# Plot and describe the number of maximum concurrent windows per client session.

# In[138]:

per_session_max_windows = combined_per_session_win.map(lambda x: x[1][1])
plot_series(pd.Series(per_session_max_windows.collect()))


# How many subsessions don't include a window open event? The statistics below point out that most of the subsessions have 0 window open events.

# In[9]:

subsessions_with_window_open = all_pings.filter(lambda p: p["payload/processes/parent/scalars"])                                        .map(lambda p: p["payload/processes/parent/scalars"]                                             .get("browser.engagement.window_open_event_count", 0))

scalar_series = pd.Series(subsessions_with_window_open.collect())
plot_series(scalar_series)


# ### Total count of URIs

# In[139]:

plot_histogram_scalar(broken_uri_pings, "browser.engagement.total_uri_count")


# In[140]:

plot_histogram_scalar(latest_pings, "browser.engagement.total_uri_count")


# How many URIs are the clients opening, per day?

# In[141]:

daily_uris_per_user = values_per_day(latest_pings, "browser.engagement.total_uri_count")                          .map(lambda x: np.sum(x[1]))
plot_series(pd.Series(daily_uris_per_user.collect()))


# Take a look at the clients opening more than 10k URIs per subsession.

# In[142]:

URI_THRESHOLD = 10000 # 10k uris
pings_many_uris = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"])                              .filter(lambda p: p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0) > URI_THRESHOLD)


# What's the distribution of their subsession lengths?

# In[143]:

pings_many_uris_ssl = pings_many_uris.map(lambda p: p.get("payload/info/subsessionLength"))
plot_series(pd.Series(pings_many_uris_ssl.collect()), 10, 0)


# And what about their session lengths?

# In[144]:

pings_many_uris_sl = pings_many_uris.filter(lambda p: p.get("payload/info/reason") == "shutdown")                                    .map(lambda p: p.get("payload/info/sessionLength"))
plot_series(pd.Series(pings_many_uris_sl.collect()), 10, 0)


# How many clients are acting like that?

# In[145]:

heavy_uri_loaders_clients = pings_many_uris.map(lambda p: p["meta/clientId"]).distinct().collect()
heavy_uri_loaders = len(heavy_uri_loaders_clients)
total_clients = latest_pings.map(lambda p: p["meta/clientId"]).distinct().count()


# In[146]:

print "{} clients ({}) are opening more than 10k URIs"    .format(heavy_uri_loaders, pct(heavy_uri_loaders, total_clients))


# Do these clients always behave the same?

# In[147]:

# Every URI count reported by each heavy-loader client, as
# (clientId, [count, count, ...]).
# The client ids are looked up in a set: testing membership against the
# collected list would scan the whole list once per ping.
heavy_uri_loaders_ids = frozenset(heavy_uri_loaders_clients)
uri_behaviour = (
    latest_pings
    .filter(lambda p: p["meta/clientId"] in heavy_uri_loaders_ids)
    .filter(lambda p: p["payload/processes/parent/scalars"] and
                      p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", False))
    .map(lambda p: (p["meta/clientId"],
                    [p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0)]))
    .reduceByKey(lambda x, y: x + y)
)


# In[148]:

# Per client: min, max, p75 and p95 of the reported URI counts, plus the
# number of samples they were computed from.
uri_behaviour_rdd = uri_behaviour.map(lambda x: (np.min(x[1]),
                                                 np.max(x[1]),
                                                 np.percentile(x[1], 75),
                                                 np.percentile(x[1], 95),
                                                 len(x[1])))
# Collect the RDD built just above; the original referenced an undefined
# name (uri_rdd) here, which raises NameError.
uri_behaviour_df = pd.DataFrame(uri_behaviour_rdd.collect())
uri_behaviour_df.columns = ["# URIs min", "# URIs max", "p75", "p95", "Samples"]
uri_behaviour_df


# Inspect other fields to try to figure out if super high URI counts come from some automated instance of Firefox:
#
# * either a new or constantly resetting profile
# * low session counts (1 or "few", profileSubsessionCounter as a proxy?)
# * lower session lengths
# * submit high counts with each session.
# * Maybe a proxy is "for pathological clients, the uri counts p25 is pretty close to p90"?

# In[149]:

# Profile creation dates are stored as a number of days since the Unix epoch.
# The file imports the module as ``dt``; the bare ``datetime`` name was only
# available as a side effect of the pylab magic.
UNIX_EPOCH_DAY = dt.datetime(1970, 1, 1)

def get_session_info(p):
    """Extract the session-related fields of a ping into a flat dict.

    Args:
        p: mapping of ping properties; every field is read with ``get`` so
            missing properties come back as None.

    Returns:
        A dict with the keys submissionDate, profileCreationDate, reason,
        profileSubsessionCounter, sessionLength and subsessionLength.
        profileCreationDate is a datetime, or None when the ping carries no
        creation date (a None day count cannot be turned into a timedelta).
    """
    creation_days = p.get("environment/profile/creationDate")
    if creation_days is None:
        creation_date = None
    else:
        creation_date = UNIX_EPOCH_DAY + dt.timedelta(days=creation_days)

    return {
        "submissionDate": p.get("meta/submissionDate"),
        "profileCreationDate": creation_date,
        "reason": p.get("payload/info/reason"),
        "profileSubsessionCounter": p.get("payload/info/profileSubsessionCounter"),
        "sessionLength": p.get("payload/info/sessionLength"),
        "subsessionLength": p.get("payload/info/subsessionLength")
    }

many_uris_session = pings_many_uris.map(get_session_info)
many_uris_session.count()


# In[150]:

pd.DataFrame(many_uris_session.collect())


# ### Total unique domains count

# In[151]:

plot_histogram_scalar(broken_uri_pings, "browser.engagement.unique_domains_count")


# In[152]:

plot_histogram_scalar(latest_pings, "browser.engagement.unique_domains_count")


# What's the percentage of session fragments which recorded >= 100 unique domains?

# In[153]:

num_latest_pings = latest_pings.count()
num_latest_pings


# In[154]:

def hit_upper_domain_bound(p):
    """Tell whether a ping reports 100 or more unique domains.

    Returns False when the scalars section is missing or empty, or when the
    unique-domains scalar is not reported.
    """
    section = p["payload/processes/parent/scalars"]
    domains_key = "browser.engagement.unique_domains_count"
    if not section or domains_key not in section:
        return False

    reported = section[domains_key]
    if reported is False:
        return False

    # 100 is the upper bound, we should never go beyond that.
    return reported >= 100

pings_hitting_domain_bounds = latest_pings.filter(hit_upper_domain_bound)
domain_count_upper_bounds = pings_hitting_domain_bounds.count()


# In[155]:

# The message now names the threshold it reports on (it was missing).
# A single parenthesised argument prints the same text on Python 2 and 3.
print("Subsessions with >= 100 unique domains: {} - {}".format(
    domain_count_upper_bounds,
    pct(domain_count_upper_bounds, num_latest_pings)))


# How many clients?

# In[156]:

clients_unique_domains = pings_hitting_domain_bounds.map(lambda p: p["meta/clientId"]).distinct().collect()
print "{} clients ({}) are opening more than 100 unique domains"    .format(len(clients_unique_domains), pct(len(clients_unique_domains), total_clients))


# How many heavy URI loaders are also hitting the 100 unique domains?

# In[157]:

# Both lists hold distinct client ids, so the size of the set intersection
# equals the count produced by the former nested list scan - without the
# quadratic membership tests.
len(set(heavy_uri_loaders_clients) & set(clients_unique_domains))


# How many unique domains (maximum among all the session fragments) is each user visiting, per day?

# In[158]:

daily_domains_per_user = values_per_day(latest_pings, "browser.engagement.unique_domains_count")                             .map(lambda x: np.max(x[1]))
plot_series(pd.Series(daily_domains_per_user.collect()))


# How many unique domains (p95 among all the session fragments) is each user visiting, per day?

# In[159]:

daily_domains_per_user = values_per_day(latest_pings, "browser.engagement.unique_domains_count")                             .map(lambda x: np.percentile(x[1], 95))
plot_series(pd.Series(daily_domains_per_user.collect()))


# # Data correlations

# This section tries to dive into the relationship between the previous results:
# * (1) 50% of users open at least 15 tabs per day
# * (2) 50% of users open at least 2 windows per day
# * (3) 50% of users open at least 13 URIs per day

# ### Can we explain the relationship between (1) & (3)?

# In[ ]:

	# coding: utf-8

	# ## Bug 1276200 - Validate engagement measurements

	# In[1]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	import datetime as dt
	from uuid import UUID

	from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history

	get_ipython().magic(u'pylab inline')


	# We get two chunks of pings:
	# * broken_uri_pings: after the engagement measurements landed (in bug 1271313) up to bug 1293222, which fixes the broken URI counts
	# * latest_pings: after bug 1293222 lands

	# In[16]:

	def dedupe(pings):
	    """Keep a single ping for every distinct "meta/documentId"."""
	    # Body indentation restored: it had been flattened to the def's level.
	    return (pings.map(lambda p: (p["meta/documentId"], p))
	                 .reduceByKey(lambda a, b: a)
	                 .values())

	def filter(pings):
	    """Reduce the pings to the properties used by this analysis, then dedupe.

	    NOTE(review): the name shadows the built-in ``filter``; it is kept
	    because the rest of the file calls the function under this name.
	    """
	    # Body indentation restored: it had been flattened to the def's level.
	    subset = get_pings_properties(pings, ["meta/clientId",
	                                          "meta/documentId",
	                                          "meta/submissionDate",
	                                          "environment/profile/creationDate",
	                                          "environment/profile/resetDate",
	                                          "environment/build",
	                                          "environment/partner",
	                                          "environment/system",
	                                          "payload/info/reason",
	                                          "payload/info/sessionId",
	                                          "payload/info/subsessionLength",
	                                          "payload/info/sessionLength",
	                                          "payload/info/profileSubsessionCounter",
	                                          "payload/processes/parent/scalars"])
	    return dedupe(subset)

	broken_uri_pings = filter(get_pings(sc,
	app="Firefox",
	channel="nightly",
	doc_type="main",
	schema="v4",
	submission_date=("20160808", "20160815"), # Only one week of submissions.
	build_id=("20160722000000", "20160815000000"), # Up to bug 1293222
	fraction=1.0))

	latest_pings = filter(get_pings(sc,
	app="Firefox",
	channel="nightly",
	doc_type="main",
	schema="v4",
	submission_date=("20160815", "20160822"),
	build_id=("20160815000000", "20160822000000"), # Post bug 1293222
	fraction=1.0))

	all_pings = broken_uri_pings + latest_pings


	# Make sure each ping has a scalars section and that the engagement measurement scalars it contains have the right format.

	# In[17]:

	def engagement_measurements_check(p):
	    """Validate the engagement scalars carried by a single ping.

	    Returns a ``(problem, p)`` tuple; ``problem`` is the empty string when
	    every check passes, otherwise a description of the first failed check.
	    """
	    # Block structure restored: the nesting had been flattened.
	    known_engagement_scalars = {
	        "browser.engagement.max_concurrent_tab_count": int,
	        "browser.engagement.max_concurrent_window_count": int,
	        "browser.engagement.tab_open_event_count": int,
	        "browser.engagement.total_uri_count": int,
	        "browser.engagement.unique_domains_count": int,
	        "browser.engagement.window_open_event_count": int,
	    }

	    # We know these scalars must be there for the referenced timeframe.
	    expected_scalars = [
	        "browser.engagement.max_concurrent_tab_count",
	        "browser.engagement.max_concurrent_window_count"
	    ]

	    scalars = p["payload/processes/parent/scalars"]

	    if scalars is None:
	        return ("scalars section is None", p)

	    # We don't expect all the engagement measurements to be there but,
	    # if they are, make sure they have the correct format.
	    # items() (rather than iteritems()) works on both Python 2 and 3.
	    for k, v in known_engagement_scalars.items():
	        if k in scalars:
	            # Exact type match on purpose: isinstance() would accept booleans.
	            if type(scalars[k]) is not v:
	                return ("wrong type: " + k, p)
	            if scalars[k] < 0:
	                # The message now states the condition that is actually tested.
	                return ("check failed: " + k + " < 0", p)

	    # We're not expecting other scalars from these builds.
	    for k in scalars:
	        if k not in known_engagement_scalars:
	            return ("unexpected scalar: " + k, p)

	    for s in expected_scalars:
	        if s not in scalars:
	            return ("{} not reported".format(s), p)

	    return ("", p)

	checked_pings = all_pings.map(engagement_measurements_check)
	result_counts = checked_pings.countByKey()
	result_counts


	# ### Let's dig into the pings with missing engagement measurements.

	# In[5]:

	latest_pings.map(engagement_measurements_check).countByKey()


	# In[18]:

	missing_eng = latest_pings.filter(lambda p: (p["payload/processes/parent/scalars"] != None) and (len(p["payload/processes/parent/scalars"].keys()) == 0))


	# How many clients are sending an empty scalars section?

	# In[12]:

	len(missing_eng.map(lambda p: p["meta/clientId"]).distinct().collect())


	# Let's check the distribution of the subsession lengths.

	# In[20]:

	missing_ssl = missing_eng.map(lambda p: p["payload/info/subsessionLength"]).collect()
	plot_series(pd.Series(missing_ssl), 30, 0)


	# ### Helper functions for plotting and analysing.

	# In[4]:

	def plot_series(series, graph_bins=100, graph_min=0.1):
	    """Draw a log-scaled histogram of ``series`` and return its describe()."""
	    # Body indentation restored: it had been flattened to the def's level.
	    # Plot to an histogram.
	    fig, ax = plt.subplots()
	    series.hist(ax=ax, bins=graph_bins, bottom=graph_min)
	    ax.set_yscale('log')
	    # Return some descriptive statistics.
	    return series.describe(percentiles=[.5, .75, .95, .99, .995])

	def plot_histogram_scalar(pings, scalar_name):
	    """Plot and describe the values of one scalar across ``pings``.

	    Pings where the scalar is absent or falsy (0 included) are skipped.
	    """
	    # Body indentation restored: it had been flattened to the def's level.
	    scalar_values = (
	        pings.filter(lambda p: p["payload/processes/parent/scalars"] and
	                               p["payload/processes/parent/scalars"].get(scalar_name, False))
	             .map(lambda p: p["payload/processes/parent/scalars"][scalar_name]))
	    scalar_series = pd.Series(scalar_values.collect())
	    return plot_series(scalar_series)

	def values_per_day(pings, scalar):
	    """Group the values of ``scalar`` by submission date and client."""
	    # Body indentation restored: it had been flattened to the def's level.
	    # Each entry in the |daily_per_user| RDD is like:
	    # (("date", "clientId"), [ ... scalar values for the client, date ... ])
	    daily_per_user = (
	        pings.filter(lambda p: p["payload/processes/parent/scalars"] and
	                               p["payload/processes/parent/scalars"].get(scalar, False))
	             .map(lambda p: ((p["meta/submissionDate"], p["meta/clientId"]),
	                             [ p["payload/processes/parent/scalars"].get(scalar) ]))
	             .reduceByKey(lambda a,b: a + b))
	    return daily_per_user

	def pct(a, b):
	    """Return ``a / b`` as a float rounded to three decimals (a fraction, not a percentage)."""
	    # Body indentation restored: it had been flattened to the def's level.
	    return round(float(a) / b, 3)


	# ### Maximum Concurrent Tab Count

	# In[123]:

	plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_tab_count")


	# What's the maximum number of concurrent tabs each user has, per day?

	# In[124]:

	daily_max_tabs_per_user = values_per_day(all_pings, "browser.engagement.max_concurrent_tab_count") .map(lambda x: np.max(x[1]))
	plot_series(pd.Series(daily_max_tabs_per_user.collect()))


	# ### Maximum Concurrent Window Count

	# In[125]:

	plot_histogram_scalar(all_pings, "browser.engagement.max_concurrent_window_count")


	# What's the maximum number of concurrent windows each user has, per day?

	# In[126]:

	daily_max_wins_per_user = values_per_day(all_pings, "browser.engagement.max_concurrent_window_count") .map(lambda x: np.max(x[1]))
	plot_series(pd.Series(daily_max_wins_per_user.collect()))


	# ### Tab Open Event Count

	# In[127]:

	plot_histogram_scalar(all_pings, "browser.engagement.tab_open_event_count")


	# How many tabs are being opened by each user, per day?

	# In[128]:

	daily_tab_opens_per_user = values_per_day(all_pings, "browser.engagement.tab_open_event_count") .map(lambda x: np.sum(x[1]))
	plot_series(pd.Series(daily_tab_opens_per_user.collect()))


	# Compare tab open event and the maximum tab count, over a subsession:
	# * Get the maximum among all the fragments for the concurrent tabs
	# * Sum the open events for each fragment

	# In[129]:

	def map_to_tab_measurements(p):
	    """Turn a ping into ((clientId, sessionId), (tab opens, max tabs))."""
	    # Body indentation restored: it had been flattened to the def's level.
	    scalars = p["payload/processes/parent/scalars"]
	    max_cnt = scalars.get("browser.engagement.max_concurrent_tab_count", 0)
	    open_cnt = scalars.get("browser.engagement.tab_open_event_count", 0)
	    return ((p["meta/clientId"], p["payload/info/sessionId"]), (open_cnt, max_cnt))

	per_session_tab = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"]) .map(map_to_tab_measurements)


	# In[130]:

	combined_per_session_tab = per_session_tab.combineByKey(lambda x: x,
	lambda acc, x: (acc[0] + x[0], max(acc[1], x[1])),
	lambda x, y: (x[0] + y[0], max(x[1], y[1])))


	# Plot and describe the number of tab open events per client session.

	# In[131]:

	per_session_tab_open_events = combined_per_session_tab.map(lambda x: x[1][0])
	plot_series(pd.Series(per_session_tab_open_events.collect()))


	# Plot and describe the maximum number of concurrent tabs per client session.

	# In[132]:

	per_session_max_tabs = combined_per_session_tab.map(lambda x: x[1][1])
	plot_series(pd.Series(per_session_max_tabs.collect()))


	# ### Window Open Event Count

	# In[133]:

	plot_histogram_scalar(all_pings, "browser.engagement.window_open_event_count")


	# How many windows are being opened by the clients, per day?

	# In[134]:

	daily_win_opens_per_user = values_per_day(all_pings, "browser.engagement.window_open_event_count") .map(lambda x: np.sum(x[1]))
	plot_series(pd.Series(daily_win_opens_per_user.collect()))


	# Compare window open event and the maximum window count, over a subsession:
	# * Get the maximum among all the fragments for the concurrent windows
	# * Sum the open events for each fragment

	# In[135]:

	def map_to_win_measurements(p):
	    """Turn a ping into ((clientId, sessionId), (window opens, max windows))."""
	    # Body indentation restored: it had been flattened to the def's level.
	    scalars = p["payload/processes/parent/scalars"]
	    max_cnt = scalars.get("browser.engagement.max_concurrent_window_count", 0)
	    open_cnt = scalars.get("browser.engagement.window_open_event_count", 0)
	    return ((p["meta/clientId"], p["payload/info/sessionId"]), (open_cnt, max_cnt))

	per_session_win = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"]) .map(map_to_win_measurements)


	# In[136]:

	combined_per_session_win = per_session_win.combineByKey(lambda x: x,
	lambda acc, x: (acc[0] + x[0], max(acc[1], x[1])),
	lambda x, y: (x[0] + y[0], max(x[1], y[1])))


	# Plot and describe the number of window open events per client session.

	# In[137]:

	per_session_win_open_events = combined_per_session_win.map(lambda x: x[1][0])
	plot_series(pd.Series(per_session_win_open_events.collect()))


	# Plot and describe the number of maximum concurrent windows per client session.

	# In[138]:

	per_session_max_windows = combined_per_session_win.map(lambda x: x[1][1])
	plot_series(pd.Series(per_session_max_windows.collect()))


	# How many subsessions don't include a window open event? The statistics below point out that most of the subsessions have 0 window open events.

	# In[9]:

	subsessions_with_window_open = all_pings.filter(lambda p: p["payload/processes/parent/scalars"]) .map(lambda p: p["payload/processes/parent/scalars"] .get("browser.engagement.window_open_event_count", 0))

	scalar_series = pd.Series(subsessions_with_window_open.collect())
	plot_series(scalar_series)


	# ### Total count of URIs

	# In[139]:

	plot_histogram_scalar(broken_uri_pings, "browser.engagement.total_uri_count")


	# In[140]:

	plot_histogram_scalar(latest_pings, "browser.engagement.total_uri_count")


	# How many URIs are the clients opening, per day?

	# In[141]:

	daily_uris_per_user = values_per_day(latest_pings, "browser.engagement.total_uri_count") .map(lambda x: np.sum(x[1]))
	plot_series(pd.Series(daily_uris_per_user.collect()))


	# Take a look at the clients opening more than 10k URIs per subsession.

	# In[142]:

	URI_THRESHOLD = 10000 # 10k uris
	pings_many_uris = latest_pings.filter(lambda p: p["payload/processes/parent/scalars"]) .filter(lambda p: p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0) > URI_THRESHOLD)


	# What's the distribution of their subsession lengths?

	# In[143]:

	pings_many_uris_ssl = pings_many_uris.map(lambda p: p.get("payload/info/subsessionLength"))
	plot_series(pd.Series(pings_many_uris_ssl.collect()), 10, 0)


	# And what about their session lengths?

	# In[144]:

	pings_many_uris_sl = pings_many_uris.filter(lambda p: p.get("payload/info/reason") == "shutdown") .map(lambda p: p.get("payload/info/sessionLength"))
	plot_series(pd.Series(pings_many_uris_sl.collect()), 10, 0)


	# How many clients are acting like that?

	# In[145]:

	heavy_uri_loaders_clients = pings_many_uris.map(lambda p: p["meta/clientId"]).distinct().collect()
	heavy_uri_loaders = len(heavy_uri_loaders_clients)
	total_clients = latest_pings.map(lambda p: p["meta/clientId"]).distinct().count()


	# In[146]:

	print "{} clients ({}) are opening more than 10k URIs" .format(heavy_uri_loaders, pct(heavy_uri_loaders, total_clients))


	# Do these clients always behave the same?

	# In[147]:

	uri_behaviour = latest_pings.filter(lambda p: p["meta/clientId"] in heavy_uri_loaders_clients) .filter(lambda p: p["payload/processes/parent/scalars"] and p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", False)) .map(lambda p: (p["meta/clientId"], [ p["payload/processes/parent/scalars"].get("browser.engagement.total_uri_count", 0)])) .reduceByKey(lambda x,y: x + y)


	# In[148]:

	# Per client: min, max, p75 and p95 of the reported URI counts, plus the
	# number of samples they were computed from.
	uri_behaviour_rdd = uri_behaviour.map(lambda x: (np.min(x[1]), np.max(x[1]), np.percentile(x[1], 75), np.percentile(x[1], 95), len(x[1])))
	# Collect the RDD built just above; the original referenced an undefined
	# name (uri_rdd) here, which raises NameError.
	uri_behaviour_df = pd.DataFrame(uri_behaviour_rdd.collect())
	uri_behaviour_df.columns = ["# URIs min", "# URIs max", "p75", "p95", "Samples"]
	uri_behaviour_df


	# Inspect other fields to try to figure out if super high URI counts come from some automated instance of Firefox:
	#
	# * either a new or constantly resetting profile
	# * low session counts (1 or "few", profileSubsessionCounter as a proxy?)
	# * lower session lengths
	# * submit high counts with each session.
	# * Maybe a proxy is "for pathological clients, the uri counts p25 is pretty close to p90"?

	# In[149]:

	# Profile creation dates are stored as a number of days since the Unix epoch.
	# The file imports the module as ``dt``; the bare ``datetime`` name was only
	# available as a side effect of the pylab magic.
	UNIX_EPOCH_DAY = dt.datetime(1970, 1, 1)

	def get_session_info(p):
	    """Extract the session-related fields of a ping into a flat dict.

	    profileCreationDate is a datetime, or None when the ping carries no
	    creation date (a None day count cannot be turned into a timedelta).
	    """
	    # Body indentation restored: it had been flattened to the def's level.
	    creation_days = p.get("environment/profile/creationDate")
	    if creation_days is None:
	        creation_date = None
	    else:
	        creation_date = UNIX_EPOCH_DAY + dt.timedelta(days=creation_days)

	    return {
	        "submissionDate": p.get("meta/submissionDate"),
	        "profileCreationDate": creation_date,
	        "reason": p.get("payload/info/reason"),
	        "profileSubsessionCounter": p.get("payload/info/profileSubsessionCounter"),
	        "sessionLength": p.get("payload/info/sessionLength"),
	        "subsessionLength": p.get("payload/info/subsessionLength")
	    }

	many_uris_session = pings_many_uris.map(get_session_info)
	many_uris_session.count()


	# In[150]:

	pd.DataFrame(many_uris_session.collect())


	# ### Total unique domains count

	# In[151]:

	plot_histogram_scalar(broken_uri_pings, "browser.engagement.unique_domains_count")


	# In[152]:

	plot_histogram_scalar(latest_pings, "browser.engagement.unique_domains_count")


	# What's the percentage of session fragments which recorded >= 100 unique domains?

	# In[153]:

	num_latest_pings = latest_pings.count()
	num_latest_pings


	# In[154]:

	def hit_upper_domain_bound(p):
	    """Tell whether a ping reports 100 or more unique domains.

	    Returns False when the scalars section is missing or empty, or when
	    the unique-domains scalar is not reported.
	    """
	    # Block structure restored: the nesting had been flattened.
	    if not p["payload/processes/parent/scalars"]:
	        return False

	    domain_count = p["payload/processes/parent/scalars"].get("browser.engagement.unique_domains_count", False)
	    if domain_count is False:
	        return False

	    return domain_count >= 100 # 100 is the upper bound, we should never go beyond that.

	pings_hitting_domain_bounds = latest_pings.filter(hit_upper_domain_bound)
	domain_count_upper_bounds = pings_hitting_domain_bounds.count()


	# In[155]:

	# The message now names the threshold it reports on (it was missing).
	# A single parenthesised argument prints the same text on Python 2 and 3.
	print("Subsessions with >= 100 unique domains: {} - {}".format(
	    domain_count_upper_bounds,
	    pct(domain_count_upper_bounds, num_latest_pings)))


	# How many clients?

	# In[156]:

	clients_unique_domains = pings_hitting_domain_bounds.map(lambda p: p["meta/clientId"]).distinct().collect()
	print "{} clients ({}) are opening more than 100 unique domains" .format(len(clients_unique_domains), pct(len(clients_unique_domains), total_clients))


	# How many heavy URI loaders are also hitting the 100 unique domains?

	# In[157]:

	len([clientId for clientId in heavy_uri_loaders_clients if clientId in clients_unique_domains])


	# How many unique domains (maximum among all the session fragments) is each user visiting, per day?

	# In[158]:

	daily_domains_per_user = values_per_day(latest_pings, "browser.engagement.unique_domains_count") .map(lambda x: np.max(x[1]))
	plot_series(pd.Series(daily_domains_per_user.collect()))


	# How many unique domains (p95 among all the session fragments) is each user visiting, per day?

	# In[159]:

	daily_domains_per_user = values_per_day(latest_pings, "browser.engagement.unique_domains_count") .map(lambda x: np.percentile(x[1], 95))
	plot_series(pd.Series(daily_domains_per_user.collect()))


	# # Data correlations

	# This section tries to dive into the relationship between the previous results:
	# * (1) 50% of users open at least 15 tabs per day
	# * (2) 50% of users open at least 2 windows per day
	# * (3) 50% of users open at least 13 URIs per day

	# ### Can we explain the relationship between (1) & (3)?

	# In[ ]: