-
-
Save teonbrooks/a271aa4fd7798887bbdc77f14cf4efb6 to your computer and use it in GitHub Desktop.
Updated TxP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# In[1]:
# Notebook setup: render matplotlib figures inline.
get_ipython().magic(u'matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
# In[2]:
# this version of pyspark instantiates a SparkContext `sc` and SQLContext `sqlContext`
import pyspark.sql.functions as fun
from pyspark.sql.window import Window
from pyspark.sql import Row
# In[3]:
# Analysis window boundaries as YYYYMMDD strings; cast to int where used
# for submission_date_s3 range filters.
SIXM_DATE = '20170801'  # start of the six-month window
M_DATE = '20180201'     # start of the one-month window
END_DATE = '20180228'   # end of both windows
A_DATE = '20180201'     # single day used for the population snapshot
# In[4]:
# Pre-aggregated Test Pilot MAU/DAU table.
txp_dau = sqlContext.read.option("mergeSchema", "true") \
    .parquet("s3://telemetry-parquet/txp_mau_dau_simple/v3")
# In[5]:
# Restrict to the six-month window and to the Test Pilot cohort, then
# pull the (small) aggregate down into pandas for plotting.
txp_dau_df = txp_dau.filter(txp_dau.submission_date_s3.between(int(SIXM_DATE), int(END_DATE))) \
    .filter(txp_dau.test == 'testpilot') \
    .toPandas()
# ## TxP DAU-MAU
# Let's look at the number of TxP client_ids over the past six months
# In[6]:
txp_dau_df.sort_values('submission_date_s3', inplace=True)
# In[7]:
# Parse the YYYYMMDD integer/string column into real datetimes for the x-axis.
txp_dau_df['date'] = pd.to_datetime(txp_dau_df['submission_date_s3'], format='%Y%m%d')
# In[8]:
txp_dau_df.plot('date', 'dau', title='TxP DAU')
# In[9]:
# Fixed copy-paste bug: this cell plots MAU, but was titled 'TxP DAU'.
txp_dau_df.plot('date', 'mau', title='TxP MAU')
# Aside: This commented-out section is a way to get an estimate of clients for TxP from main_summary.
# NOTE(review): if this is revived, `fun.sumDistinct("client_id")` below sums
# the distinct values — on string client_ids that is not a client count; the
# intent was presumably `fun.countDistinct("client_id")`. Verify before reuse.
# In[10]:
# six_months = data.filter(data.submission_date_s3.between(int(SIXM_DATE), int(END_DATE))) \
#     .filter("app_name = 'Firefox'") \
#     .select("client_id",
#             "submission_date_s3",
#             fun.array_contains(data.active_addons.addon_id, '@testpilot-addon').alias('txp_added'))
# In[11]:
# six_months.printSchema()
# In[12]:
# txp_total = six_months.filter("sample_id % 10 = 1") \
#     .filter(six_months.txp_added == True) \
#     .groupby("submission_date_s3") \
#     .agg(fun.sumDistinct("client_id").alias("client_id"))
# In[13]:
# txp_six_months = txp_total.collect()
# In[14]:
# plt.plot(txp_total.submission_date_s3, txp_total.client_id)
# ## Characteristics of TxP users vs General Fx users
# In[15]:
data = sqlContext.read.option("mergeSchema", "true") \
    .parquet("s3://telemetry-parquet/main_summary/v4")
# In[16]:
# Single-day, 10% sample of desktop Firefox pings, flagging clients that
# have the Test Pilot addon installed.
ms = data.filter("submission_date_s3 = '{}'".format(A_DATE)) \
    .filter("sample_id % 10 = 1") \
    .filter("app_name = 'Firefox'") \
    .withColumn('txp_added',
                fun.array_contains(data.active_addons.addon_id, '@testpilot-addon'))
# In[17]:
txp_total = ms.filter(ms.txp_added == True).select("client_id").distinct().count()
# In[18]:
total = ms.select("client_id").distinct().count()
# Test Pilot percentage estimate
# In[19]:
'Test Pilot users constitute {0:.2f}% of the total Firefox population based on 10% sample'.format(txp_total * 100 / float(total))
# Screenshots counts for a single day in February
# In[20]:
screenshots_day = ms.select("scalar_parent_screenshots_copy",
                            "scalar_parent_screenshots_download",
                            "scalar_parent_screenshots_upload")
# In[21]:
tmp = screenshots_day.agg(fun.sum("scalar_parent_screenshots_copy"),
                          fun.sum("scalar_parent_screenshots_download"),
                          fun.sum("scalar_parent_screenshots_upload")).collect()
# Multiply by 10 to extrapolate from the 10% sample to the full population.
# List comprehension instead of a bare map object so pd.DataFrame accepts it
# on both Python 2 and Python 3.
pd.DataFrame([row.asDict() for row in tmp]) * 10
# Now let's look over a month period
# In[22]:
screenshots = data.filter(data.submission_date_s3.between(int(M_DATE), int(END_DATE))) \
    .filter("app_name = 'Firefox'") \
    .select("submission_date_s3",
            "scalar_parent_screenshots_copy",
            "scalar_parent_screenshots_download",
            "scalar_parent_screenshots_upload")
# In[23]:
# Daily totals for each screenshots action.
tmp = screenshots.groupby('submission_date_s3') \
    .agg(fun.sum("scalar_parent_screenshots_copy").alias('copy'),
         fun.sum("scalar_parent_screenshots_download").alias('download'),
         fun.sum("scalar_parent_screenshots_upload").alias('upload'))
# In[24]:
screenshots_df = tmp.toPandas()
# In[25]:
screenshots_df['date'] = pd.to_datetime(screenshots_df['submission_date_s3'], format='%Y%m%d')
# In[26]:
screenshots_df.sort_values('date', inplace=True)
# In[27]:
screenshots_df.plot('date', 'copy')
# In[28]:
screenshots_df.plot('date', 'download')
# In[29]:
screenshots_df.plot('date', 'upload')
# In[30]:
# Per-ping client attributes for a 1% sample ("sample_id % 100 = 1" is a
# further subsample of the 10% already applied to `ms`).
txp = ms.filter("sample_id % 100 = 1") \
    .select("client_id",
            "profile_creation_date",
            "country",
            ms.normalized_channel.alias("channel"),
            "submission_date",
            "subsession_start_date",
            # NOTE(review): Column.substr is 1-based; Spark treats startPos 0
            # like 1, so this yields the 10-char YYYY-MM-DD prefix — confirm
            # on the Spark version in use.
            ms.subsession_start_date.substr(0, 10).alias("date"),
            "subsession_length",
            "is_default_browser",
            "active_addons_count",
            "active_ticks",
            "places_bookmarks_count",
            ms.active_addons["addon_id"].alias('addon_ids'),
            ms.active_addons["name"].alias('addon_names'),
            ms.active_addons.is_system.alias('system_addon'),
            ms.active_addons.foreign_install.alias('foreign_addon'),
            'txp_added')
# Because there can be multiple pings per day, one must agg over subsessions.
# Later, I plan to segregate addons based on their status (system vs foreign vs user), I need to be consistent in which information I take. I will select the first instance of addon instances for the comparisons.
# In[31]:
# Collapse to one row per client: `first` for categorical/addon fields,
# `sum`/`max` for the usage metrics.
txp_agg = txp.groupBy("client_id") \
    .agg(fun.first("txp_added").alias("txp_added"),
         # relevant addons
         fun.first("addon_ids").alias("addon_ids"),
         fun.first("addon_names").alias("addon_names"),
         fun.first("system_addon").alias("system_addons"),
         fun.first("foreign_addon").alias("foreign_addons"),
         fun.first("active_addons_count").alias("active_addons"),
         # other metrics
         fun.max("profile_creation_date").alias("profile_creation_date"),
         fun.sum("subsession_length").alias("session_length"),
         fun.sum("active_ticks").alias("active_ticks"),
         fun.max("places_bookmarks_count").alias("bookmarks"),
         fun.first("country").alias("country"),
         fun.first("channel").alias("channel"))
# In[32]:
txp_df = txp_agg.toPandas()
# In[33]:
def BoolArray2Sum(array):
    """Count the True entries in *array*, ignoring None values.

    Parameters:
        array: iterable of bool-like values (Spark rows may contain None
            for missing entries).

    Returns:
        int: number of truthy entries.
    """
    # Builtin sum over a generator avoids building an intermediate list and
    # the numpy dependency; it also returns int 0 (not numpy float 0.0) for
    # an all-None or empty input.
    return sum(int(val) for val in array if val is not None)
# In[34]:
# Map the txp_added flag onto readable group labels (0/False -> 'fx',
# 1/True -> 'txp') for groupby/plot labels below.
txp_df['txp_added'].replace({0: 'fx', 1: 'txp'}, inplace=True)
# In[35]:
# Deal with missing data and implausible values
txp_df.dropna(inplace=True)
# NOTE(review): (txp_df > 0) also compares the string labels now in
# 'txp_added'; under Python 2 any non-empty string compares > 0, so the
# filter effectively applies to the numeric columns — confirm before
# porting to Python 3, where str > int raises TypeError.
txp_df = txp_df[(txp_df > 0).all(1)]
# In[36]:
# Count system addons per client (True entries in the per-addon flag array).
txp_df['system_addon_count'] = txp_df['system_addons'].apply(BoolArray2Sum)
# In[37]:
txp_df['foreign_addon_count'] = txp_df['foreign_addons'].apply(BoolArray2Sum)
# In[38]:
# User-installed addons = total minus system and foreign.
txp_df['user_addon_count'] = txp_df['active_addons'] - txp_df['system_addon_count'] - txp_df['foreign_addon_count']
# In[39]:
# Handle outliers: drop clients more than 3 standard deviations from the
# mean addon count.
txp_df = txp_df[np.abs(txp_df['active_addons'] - txp_df['active_addons'].mean()) < txp_df['active_addons'].std() * 3]
# In[40]:
txp_df.groupby('txp_added').describe()
# Below are some comparison plots
# In[41]:
# Trailing ';' suppresses the axes repr in notebook output; applied
# consistently to every plot cell.
txp_df.boxplot(['active_addons'], by='txp_added');
# In[42]:
txp_df.boxplot(['system_addon_count'], by='txp_added');
# In[43]:
txp_df.hist('system_addon_count', by='txp_added');
# In[44]:
txp_df.boxplot(['foreign_addon_count'], by='txp_added');
# In[45]:
txp_df.boxplot('user_addon_count', by='txp_added');
# In[46]:
txp_df.groupby('txp_added')['user_addon_count'].describe()
# In[47]:
txp_df.hist('bookmarks', by='txp_added');
# In[48]:
txp_df.boxplot('profile_creation_date', by='txp_added');
# In[50]:
txp_df.boxplot('session_length', by='txp_added');
# In[55]:
txp_df.boxplot('active_ticks', by='txp_added');
# In[71]:
# Channel distribution per cohort (count of rows per channel).
txp_df.groupby(['txp_added', 'channel']).agg({'channel': 'count'})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment