bsmedberg/tab-usage-release-users.ipynb

## tab-usage-release-users.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              tab-usage-release-users.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## tab-usage-release-users.py

# coding: utf-8

# # Tab/Pageload Behavior of Firefox Release Users
#
# Author: bsmedberg@mozilla.com
#
# Date: 8-May-2017
#
# Status: Partially-reviewed draft.

# In[1]:

import plotly.plotly as py
from plotly.graph_objs import *
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions
import numpy as np


# In[2]:

main_summary = spark.read.option("mergeSchema", "true")    .parquet('s3://telemetry-parquet/main_summary/v3/')    .createOrReplaceTempView("main_summary")


# In[37]:

sample_unfiltered = spark.sql("""
    SELECT
        client_id,
        SUM(CASE WHEN app_version < '52' THEN 1 ELSE 0 END) AS bad_version,
        count(1) as subsession_count,
        max(max_concurrent_tab_count) as max_concurrent_tab_count,
        sum(tab_open_event_count) as tab_open_event_count,
        SUM(CASE WHEN subsession_length < 0 OR subsession_length > 172800 THEN 1 ELSE 0 END) AS bad_length,
        sum(subsession_length) as subsession_length_sum,
        sum(total_uri_count) as total_uri_count,
        SUM(CASE WHEN reason = 'aborted-session' THEN 1 ELSE 0 END) AS aborted_sessions,
        SUM(CASE WHEN reason = 'shutdown' THEN 1 ELSE 0 END) AS shutdown_session,
        SUM(CASE WHEN reason = 'environment-change' THEN 1 ELSE 0 END) AS environment_session,
        SUM(CASE WHEN reason = 'daily' THEN 1 ELSE 0 END) AS daily_session
    FROM main_summary
    WHERE
          submission_date_s3 >= '20170415' AND submission_date_s3 <= '20170429'
      AND subsession_start_date >= '2017-04-16' AND subsession_start_date < '2017-04-23'
    GROUP BY 1
    HAVING bad_version = 0 AND bad_length = 0
    """
).cache()


# In[38]:

sample_unfiltered.count()


# # Discarding severe outliers
#
# When defining heavy users in terms of total pageloads, we can suffer from bots and other unreal situations that inflate pageloads. Discard anyone who loads more than a page every 10 seconds or more than 30000/week (because WTF).

# In[39]:

total_uri_count_unfiltered = sample_unfiltered.select(pyspark.sql.functions.sum(sample_unfiltered.total_uri_count)).first()[0]
total_uri_count_unfiltered


# In[40]:

outliers = sample_unfiltered.where((sample_unfiltered.total_uri_count > 30000) | (sample_unfiltered.subsession_length_sum / sample_unfiltered.total_uri_count < 10))
outliers_uri_count = outliers.select(pyspark.sql.functions.sum(outliers.total_uri_count)).first()[0]

print "total outliers: {}".format(outliers.count())
print "total URIs loaded by outliers: {} ({:.0f}%)".format(outliers.count(), 100.0 * outliers_uri_count / total_uri_count_unfiltered)
print "example outliers:"
outliers.select("subsession_count", "max_concurrent_tab_count", "tab_open_event_count", "subsession_length_sum", "total_uri_count").sample(False, 0.001).toPandas()


# In[61]:

# how many users just have >30000 uri loads?
sample_unfiltered.where(sample_unfiltered.total_uri_count > 30000).count()


# In[41]:

sample = sample_unfiltered.where((sample_unfiltered.total_uri_count <= 30000) & (sample_unfiltered.subsession_length_sum / sample_unfiltered.total_uri_count >= 10))
sample.createOrReplaceTempView("sample")


# ## Heavy users
#
# There are multiple definitions of a heavy user being proposed. We can't calculate heavy users by activeTicks as proposed at https://docs.google.com/a/mozilla.com/document/d/1TJUo1dbTA5mcWg5ZznKqYn3mpSqQeaDlqCRzUsi0_Js/edit?usp=sharing because this dataset doesn't yet have activeTicks.
#
# Instead, this calculates it by pageloads: at what threshold of pageloads (total_uri_count) do we find the users who are responsible for 80% of all pageloads?

# In[42]:

total_users = sample.count()
total_uri_count = sample.select(pyspark.sql.functions.sum(sample.total_uri_count)).first()[0]


# In[43]:

ranked_by_uri = spark.sql("""
    SELECT
        total_uri_count,
        RANK() OVER (ORDER BY total_uri_count) AS rank
    FROM sample
    ORDER BY total_uri_count
""").cache()


# In[44]:

def top_npct_uris(n):
    topn = ranked_by_uri.where(ranked_by_uri.rank > total_users * (100.0 - n) / 100)
    cutoff = topn.first().total_uri_count
    uri_count = topn.select(pyspark.sql.functions.sum(sample.total_uri_count)).first()[0]
    print "The top {}% of users accounted for {:.0f}% of total URI loads. (cutoff={})".format(n, 100.0 * uri_count / total_uri_count, cutoff)
    return cutoff
top_npct_uris(1)
top_npct_uris(5)
heavy_user_cutoff = top_npct_uris(10)
_ = top_npct_uris(30)


# For the purposes of this analysis and as suggested by bcolloran, we're going to look at the top 10% of users as "heavy users".

# In[45]:

print "Heavy users are users who loaded more than {} pages this week.".format(heavy_user_cutoff)


# ## Do page loads distribute differently than usage hours
#
# bcolloran did a previous analysis of the distribution of session hours and active ticks. We don't have active ticks yet, but let's calculate the distribution of usage hours to double-check our data and repeat the previous analysis

# In[46]:

total_session_length = sample.select(pyspark.sql.functions.sum(sample.subsession_length_sum)).first()[0]


# In[47]:

ranked_by_session_length = spark.sql("""
    SELECT
        subsession_length_sum,
        RANK() OVER (ORDER BY subsession_length_sum) AS rank
    FROM sample
    ORDER BY subsession_length_sum
""").cache()


# In[48]:

def top_npct_length(n):
    topn = ranked_by_session_length.where(ranked_by_session_length.rank > total_users * (100.0 - n) / 100)
    cutoff = topn.first().subsession_length_sum
    length_sum = topn.select(pyspark.sql.functions.sum(sample.subsession_length_sum)).first()[0]
    print "The top {}% of users accounted for {:.0f}% of total session hours. (cutoff={:.1f} hours)".format(n, 100.0 * length_sum / total_session_length, cutoff / 60.0 / 60.0)

top_npct_length(1)
top_npct_length(5)
top_npct_length(10)
top_npct_length(30)


# ## Comparing light and heavy user behavior
#
# The following charts compare the behavior of heavy users and normal users.

# In[49]:

def iter_with_next(l):
    "Generate (first, next) pairs for a list (skipping the last item)."
    i = iter(l)
    prev = i.next()
    for v in i:
        yield (prev, v)
        prev = v


# In[50]:

def show_histogram(colname, minv, maxv, num_buckets, label):
    buckets = range(minv, maxv, (maxv - minv) / num_buckets) + [maxv]

    selected = sample.select(pyspark.sql.functions.expr(colname).alias("data"), "total_uri_count")

    label_map = {
        -1: "<{}".format(minv),
        maxv: ">={}".format(maxv)
    }

    caseExpr = pyspark.sql.functions.when(selected['data'] < minv, -1)
    for start, end in iter_with_next(buckets):
        caseExpr = caseExpr.when(selected['data'] < end, start)
        label_map[start] = start == end - 1 and str(start) or "{}-{}".format(start, end - 1)

    caseExpr = caseExpr.otherwise(maxv)

    def print_chart(data, subgroup):
        bucketed = data.select(caseExpr.alias("bucket"))

        counts = bucketed.groupBy("bucket").count().toPandas().set_index("bucket").sort_index()

        max_count = counts.max()['count']
        total_count = counts.sum()['count']

        counts.plot(kind="bar", legend=None, figsize=((num_buckets + 2) / 3, float(max_count) / total_count * 5))

        xticks = [label_map[v] for v in counts.index.values]
        plt.xticks(range(0, num_buckets + 1), xticks)
        plt.xlabel(label)

        ytick_ratios = np.arange(0.05, float(max_count) / total_count, 0.05)
        yticks = ["{:.0f}%".format(r * 100) for r in ytick_ratios]
        yticks_pos = [r * total_count for r in ytick_ratios]
        plt.yticks(yticks_pos, yticks)
        plt.ylabel("% of {} users".format(subgroup))
        plt.show()

    print_chart(selected, "all")
    print_chart(selected.where(selected['total_uri_count'] > heavy_user_cutoff), "heavy")


# In[51]:

show_histogram("subsession_length_sum / 60 / 60", 0, 160, 20, "usage hours")


# Possible conclusions:
#
# 1. heavy users tend to use their browser more (let's state the obvious!)
# 2. heavy users are more likely to be the kind of user who keeps their browser open overnight.

# In[52]:

show_histogram("subsession_count", 0, 100, 10, "# of subsessions")


# No easy conclusions here, although there appear to be a bunch of heavy users who still open and close their browser regularly? Should validate this with the ping reason data.

# In[53]:

show_histogram("max_concurrent_tab_count", 1, 25, 12, "max # of tabs open any time during the week")


# Conclusions: heavy users tend toward a few more tabs in general, and there is a more significant proportion of tab-hoarders who regularly keep >25 tabs open.

# In[54]:

show_histogram("tab_open_event_count", 0, 2000, 25, "# of tabs opened")


# In[67]:

# how many users have a null tab_open_event_count? This could mean they didn't open any tabs... need to check how this behaves more precisely.
null_tabopen_count = sample.where(pyspark.sql.functions.isnull(sample.tab_open_event_count)).count()
print "users with no tab open count: {:,} ({:.0f}%)".format(null_tabopen_count, 100.0 * null_tabopen_count / total_users)


# In[55]:

show_histogram("total_uri_count", 0, 10000, 25, "Total toplevel URIs loaded")


# In[56]:

show_histogram("total_uri_count / subsession_length_sum * 60 * 60", 0, 1000, 25, "pageloads per session-hour")


# In[57]:

show_histogram("total_uri_count / tab_open_event_count", 0, 500, 25, "page loads per tab-open-event")


# ## Other checking

# In[58]:

show_histogram("daily_session", 0, 8, 7, "'daily' pings (evidence of leaving browser on overnight)")


# Conclusion: **confirmed** that heavy users are much more likely to leave their browser open overnight.

# In[59]:

show_histogram("aborted_sessions", 0, 20, 10, "'aborted-session' ping count (evidence of crash or abrupt logoff)")


# An environment-change ping can be caused by changing some settings, installing/enabling/disabling addons. It probably doesn't mean much.

# In[60]:

show_histogram("environment_session", 0, 20, 10, "'environment-change' ping count")

	# coding: utf-8

	# # Tab/Pageload Behavior of Firefox Release Users
	#
	# Author: bsmedberg@mozilla.com
	#
	# Date: 8-May-2017
	#
	# Status: Partially-reviewed draft.

	# In[1]:

	import plotly.plotly as py
	from plotly.graph_objs import *
	import pandas as pd
	import matplotlib.pyplot as plt
	import pyspark.sql.functions
	import numpy as np


	# In[2]:

	main_summary = spark.read.option("mergeSchema", "true") .parquet('s3://telemetry-parquet/main_summary/v3/') .createOrReplaceTempView("main_summary")


	# In[37]:

	sample_unfiltered = spark.sql("""
	SELECT
	client_id,
	SUM(CASE WHEN app_version < '52' THEN 1 ELSE 0 END) AS bad_version,
	count(1) as subsession_count,
	max(max_concurrent_tab_count) as max_concurrent_tab_count,
	sum(tab_open_event_count) as tab_open_event_count,
	SUM(CASE WHEN subsession_length < 0 OR subsession_length > 172800 THEN 1 ELSE 0 END) AS bad_length,
	sum(subsession_length) as subsession_length_sum,
	sum(total_uri_count) as total_uri_count,
	SUM(CASE WHEN reason = 'aborted-session' THEN 1 ELSE 0 END) AS aborted_sessions,
	SUM(CASE WHEN reason = 'shutdown' THEN 1 ELSE 0 END) AS shutdown_session,
	SUM(CASE WHEN reason = 'environment-change' THEN 1 ELSE 0 END) AS environment_session,
	SUM(CASE WHEN reason = 'daily' THEN 1 ELSE 0 END) AS daily_session
	FROM main_summary
	WHERE
	submission_date_s3 >= '20170415' AND submission_date_s3 <= '20170429'
	AND subsession_start_date >= '2017-04-16' AND subsession_start_date < '2017-04-23'
	GROUP BY 1
	HAVING bad_version = 0 AND bad_length = 0
	"""
	).cache()


	# In[38]:

	sample_unfiltered.count()


	# # Discarding severe outliers
	#
	# When defining heavy users in terms of total pageloads, we can suffer from bots and other unreal situations that inflate pageloads. Discard anyone who loads more than a page every 10 seconds or more than 30000/week (because WTF).

	# In[39]:

	total_uri_count_unfiltered = sample_unfiltered.select(pyspark.sql.functions.sum(sample_unfiltered.total_uri_count)).first()[0]
	total_uri_count_unfiltered


	# In[40]:

	outliers = sample_unfiltered.where((sample_unfiltered.total_uri_count > 30000) \| (sample_unfiltered.subsession_length_sum / sample_unfiltered.total_uri_count < 10))
	outliers_uri_count = outliers.select(pyspark.sql.functions.sum(outliers.total_uri_count)).first()[0]

	print "total outliers: {}".format(outliers.count())
	print "total URIs loaded by outliers: {} ({:.0f}%)".format(outliers.count(), 100.0 * outliers_uri_count / total_uri_count_unfiltered)
	print "example outliers:"
	outliers.select("subsession_count", "max_concurrent_tab_count", "tab_open_event_count", "subsession_length_sum", "total_uri_count").sample(False, 0.001).toPandas()


	# In[61]:

	# how many users just have >30000 uri loads?
	sample_unfiltered.where(sample_unfiltered.total_uri_count > 30000).count()


	# In[41]:

	sample = sample_unfiltered.where((sample_unfiltered.total_uri_count <= 30000) & (sample_unfiltered.subsession_length_sum / sample_unfiltered.total_uri_count >= 10))
	sample.createOrReplaceTempView("sample")


	# ## Heavy users
	#
	# There are multiple definitions of a heavy user being proposed. We can't calculate heavy users by activeTicks as proposed at https://docs.google.com/a/mozilla.com/document/d/1TJUo1dbTA5mcWg5ZznKqYn3mpSqQeaDlqCRzUsi0_Js/edit?usp=sharing because this dataset doesn't yet have activeTicks.
	#
	# Instead, this calculates it by pageloads: at what threshold of pageloads (total_uri_count) do we find the users who are responsible for 80% of all pageloads?

	# In[42]:

	total_users = sample.count()
	total_uri_count = sample.select(pyspark.sql.functions.sum(sample.total_uri_count)).first()[0]


	# In[43]:

	ranked_by_uri = spark.sql("""
	SELECT
	total_uri_count,
	RANK() OVER (ORDER BY total_uri_count) AS rank
	FROM sample
	ORDER BY total_uri_count
	""").cache()


	# In[44]:

	def top_npct_uris(n):
	topn = ranked_by_uri.where(ranked_by_uri.rank > total_users * (100.0 - n) / 100)
	cutoff = topn.first().total_uri_count
	uri_count = topn.select(pyspark.sql.functions.sum(sample.total_uri_count)).first()[0]
	print "The top {}% of users accounted for {:.0f}% of total URI loads. (cutoff={})".format(n, 100.0 * uri_count / total_uri_count, cutoff)
	return cutoff
	top_npct_uris(1)
	top_npct_uris(5)
	heavy_user_cutoff = top_npct_uris(10)
	_ = top_npct_uris(30)


	# For the purposes of this analysis and as suggested by bcolloran, we're going to look at the top 10% of users as "heavy users".

	# In[45]:

	print "Heavy users are users who loaded more than {} pages this week.".format(heavy_user_cutoff)


	# ## Do page loads distribute differently than usage hours
	#
	# bcolloran did a previous analysis of the distribution of session hours and active ticks. We don't have active ticks yet, but let's calculate the distribution of usage hours to double-check our data and repeat the previous analysis

	# In[46]:

	total_session_length = sample.select(pyspark.sql.functions.sum(sample.subsession_length_sum)).first()[0]


	# In[47]:

	ranked_by_session_length = spark.sql("""
	SELECT
	subsession_length_sum,
	RANK() OVER (ORDER BY subsession_length_sum) AS rank
	FROM sample
	ORDER BY subsession_length_sum
	""").cache()


	# In[48]:

	def top_npct_length(n):
	topn = ranked_by_session_length.where(ranked_by_session_length.rank > total_users * (100.0 - n) / 100)
	cutoff = topn.first().subsession_length_sum
	length_sum = topn.select(pyspark.sql.functions.sum(sample.subsession_length_sum)).first()[0]
	print "The top {}% of users accounted for {:.0f}% of total session hours. (cutoff={:.1f} hours)".format(n, 100.0 * length_sum / total_session_length, cutoff / 60.0 / 60.0)

	top_npct_length(1)
	top_npct_length(5)
	top_npct_length(10)
	top_npct_length(30)


	# ## Comparing light and heavy user behavior
	#
	# The following charts compare the behavior of heavy users and normal users.

	# In[49]:

	def iter_with_next(l):
	"Generate (first, next) pairs for a list (skipping the last item)."
	i = iter(l)
	prev = i.next()
	for v in i:
	yield (prev, v)
	prev = v


	# In[50]:

	def show_histogram(colname, minv, maxv, num_buckets, label):
	buckets = range(minv, maxv, (maxv - minv) / num_buckets) + [maxv]

	selected = sample.select(pyspark.sql.functions.expr(colname).alias("data"), "total_uri_count")

	label_map = {
	-1: "<{}".format(minv),
	maxv: ">={}".format(maxv)
	}

	caseExpr = pyspark.sql.functions.when(selected['data'] < minv, -1)
	for start, end in iter_with_next(buckets):
	caseExpr = caseExpr.when(selected['data'] < end, start)
	label_map[start] = start == end - 1 and str(start) or "{}-{}".format(start, end - 1)

	caseExpr = caseExpr.otherwise(maxv)

	def print_chart(data, subgroup):
	bucketed = data.select(caseExpr.alias("bucket"))

	counts = bucketed.groupBy("bucket").count().toPandas().set_index("bucket").sort_index()

	max_count = counts.max()['count']
	total_count = counts.sum()['count']

	counts.plot(kind="bar", legend=None, figsize=((num_buckets + 2) / 3, float(max_count) / total_count * 5))

	xticks = [label_map[v] for v in counts.index.values]
	plt.xticks(range(0, num_buckets + 1), xticks)
	plt.xlabel(label)

	ytick_ratios = np.arange(0.05, float(max_count) / total_count, 0.05)
	yticks = ["{:.0f}%".format(r * 100) for r in ytick_ratios]
	yticks_pos = [r * total_count for r in ytick_ratios]
	plt.yticks(yticks_pos, yticks)
	plt.ylabel("% of {} users".format(subgroup))
	plt.show()

	print_chart(selected, "all")
	print_chart(selected.where(selected['total_uri_count'] > heavy_user_cutoff), "heavy")


	# In[51]:

	show_histogram("subsession_length_sum / 60 / 60", 0, 160, 20, "usage hours")


	# Possible conclusions:
	#
	# 1. heavy users tend to use their browser more (let's state the obvious!)
	# 2. heavy users are more likely to be the kind of user who keeps their browser open overnight.

	# In[52]:

	show_histogram("subsession_count", 0, 100, 10, "# of subsessions")


	# No easy conclusions here, although there appear to be a bunch of heavy users who still open and close their browser regularly? Should validate this with the ping reason data.

	# In[53]:

	show_histogram("max_concurrent_tab_count", 1, 25, 12, "max # of tabs open any time during the week")


	# Conclusions: heavy users tend toward a few more tabs in general, and there is a more significant proportion of tab-hoarders who regularly keep >25 tabs open.

	# In[54]:

	show_histogram("tab_open_event_count", 0, 2000, 25, "# of tabs opened")


	# In[67]:

	# how many users have a null tab_open_event_count? This could mean they didn't open any tabs... need to check how this behaves more precisely.
	null_tabopen_count = sample.where(pyspark.sql.functions.isnull(sample.tab_open_event_count)).count()
	print "users with no tab open count: {:,} ({:.0f}%)".format(null_tabopen_count, 100.0 * null_tabopen_count / total_users)


	# In[55]:

	show_histogram("total_uri_count", 0, 10000, 25, "Total toplevel URIs loaded")


	# In[56]:

	show_histogram("total_uri_count / subsession_length_sum * 60 * 60", 0, 1000, 25, "pageloads per session-hour")


	# In[57]:

	show_histogram("total_uri_count / tab_open_event_count", 0, 500, 25, "page loads per tab-open-event")


	# ## Other checking

	# In[58]:

	show_histogram("daily_session", 0, 8, 7, "'daily' pings (evidence of leaving browser on overnight)")


	# Conclusion: confirmed that heavy users are much more likely to leave their browser open overnight.

	# In[59]:

	show_histogram("aborted_sessions", 0, 20, 10, "'aborted-session' ping count (evidence of crash or abrupt logoff)")


	# An environment-change ping can be caused by changing some settings, installing/enabling/disabling addons. It probably doesn't mean much.

	# In[60]:

	show_histogram("environment_session", 0, 20, 10, "'environment-change' ping count")