Skip to content

Instantly share code, notes, and snippets.

@teonbrooks
Last active March 20, 2018 01:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teonbrooks/a271aa4fd7798887bbdc77f14cf4efb6 to your computer and use it in GitHub Desktop.
Updated TxP
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
get_ipython().magic(u'matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
# In[2]:
# this version of pyspark instantiates a SparkContext `sc` and SQLContext `sqlContext`
import pyspark.sql.functions as fun
from pyspark.sql.window import Window
from pyspark.sql import Row
# In[3]:
# Analysis date windows as YYYYMMDD strings (converted with int() before
# being compared against the `submission_date_s3` partition column).
SIXM_DATE = '20170801'  # start of the six-month DAU/MAU window
M_DATE = '20180201'  # start of the one-month screenshots window
END_DATE = '20180228'  # end of both windows
A_DATE = '20180201'  # single snapshot day for the population comparison
# In[4]:
# Load the precomputed TxP MAU/DAU rollup and pull six months of the
# 'testpilot' test into a pandas frame for plotting.
_reader = sqlContext.read.option("mergeSchema", "true")
txp_dau = _reader.parquet("s3://telemetry-parquet/txp_mau_dau_simple/v3")
# In[5]:
_in_window = txp_dau.submission_date_s3.between(int(SIXM_DATE), int(END_DATE))
txp_dau_df = (txp_dau
              .filter(_in_window)
              .filter(txp_dau.test == 'testpilot')
              .toPandas())
# ## TxP DAU-MAU
# Let's look at the number of TxP client_ids over the past six months
# In[6]:
# Sort chronologically and add a real datetime column so the x-axis plots
# correctly.
txp_dau_df.sort_values('submission_date_s3', inplace=True)
# In[7]:
txp_dau_df['date'] = pd.to_datetime(txp_dau_df['submission_date_s3'], format='%Y%m%d')
# In[8]:
txp_dau_df.plot('date', 'dau', title='TxP DAU')
# In[9]:
# BUG FIX: this plots the 'mau' column but was titled 'TxP DAU'.
txp_dau_df.plot('date', 'mau', title='TxP MAU')
# Aside: This commented-out section is a way to get an estimate of clients for TxP from main_summary.
# In[10]:
# six_months = data.filter(data.submission_date_s3.between(int(SIXM_DATE), int(END_DATE))) \
# .filter("app_name = 'Firefox'") \
# .select("client_id",
# "submission_date_s3",
# fun.array_contains(data.active_addons.addon_id, '@testpilot-addon').alias('txp_added'))
# In[11]:
# six_months.printSchema()
# In[12]:
# txp_total = six_months.filter("sample_id % 10 = 1") \
# .filter(six_months.txp_added == True) \
# .groupby("submission_date_s3") \
# .agg(fun.sumDistinct("client_id").alias("client_id"))
# In[13]:
# txp_six_months = txp_total.collect()
# In[14]:
# plt.plot(txp_total.submission_date_s3, txp_total.client_id)
# ## Characteristics of TxP users vs General Fx users
# In[15]:
# Load main_summary and, for a single day and a 10% sample of release
# Firefox, flag clients that have the Test Pilot add-on installed.
data = (sqlContext.read
        .option("mergeSchema", "true")
        .parquet("s3://telemetry-parquet/main_summary/v4"))
# In[16]:
_has_txp = fun.array_contains(data.active_addons.addon_id, '@testpilot-addon')
ms = (data
      .filter("submission_date_s3 = '{}'".format(A_DATE))
      .filter("sample_id % 10 = 1")
      .filter("app_name = 'Firefox'")
      .withColumn('txp_added', _has_txp))
# In[17]:
# Distinct clients with the TxP add-on vs. distinct clients overall.
txp_total = ms.where(ms.txp_added == True).select("client_id").distinct().count()
# In[18]:
total = ms.select("client_id").distinct().count()
# Test Pilot percentage estimate
# In[19]:
'Test Pilot users constitute {0:.2f}% of the total Firefox population based on 10% sample'.format(txp_total * 100 / float(total))
# Screenshots counts for a single day in February
# In[20]:
# Screenshots feature counters for the single snapshot day.
screenshots_day = ms.select("scalar_parent_screenshots_copy",
                            "scalar_parent_screenshots_download",
                            "scalar_parent_screenshots_upload")
# In[21]:
tmp = screenshots_day.agg(fun.sum("scalar_parent_screenshots_copy"),
                          fun.sum("scalar_parent_screenshots_download"),
                          fun.sum("scalar_parent_screenshots_upload")).collect()
# BUG FIX: on Python 3 `map` returns an iterator, which older pandas
# versions reject in the DataFrame constructor — materialize a list instead.
# The *10 scales the "sample_id % 10 = 1" 10% sample back up to a
# whole-population estimate.
pd.DataFrame([row.asDict() for row in tmp]) * 10
# Now let's look over a month period
# In[22]:
# One month of daily Screenshots counters, summed per submission day.
_scalar_cols = ["scalar_parent_screenshots_copy",
                "scalar_parent_screenshots_download",
                "scalar_parent_screenshots_upload"]
screenshots = (data
               .filter(data.submission_date_s3.between(int(M_DATE), int(END_DATE)))
               .filter("app_name = 'Firefox'")
               .select("submission_date_s3", *_scalar_cols))
# In[23]:
tmp = (screenshots
       .groupby('submission_date_s3')
       .agg(fun.sum(_scalar_cols[0]).alias('copy'),
            fun.sum(_scalar_cols[1]).alias('download'),
            fun.sum(_scalar_cols[2]).alias('upload')))
# In[24]:
screenshots_df = tmp.toPandas()
# In[25]:
screenshots_df['date'] = pd.to_datetime(screenshots_df['submission_date_s3'], format='%Y%m%d')
# In[26]:
screenshots_df.sort_values('date', inplace=True)
# In[27]:
# One time-series plot per screenshots action.
for _metric in ('copy', 'download', 'upload'):
    screenshots_df.plot('date', _metric)
# In[30]:
# Per-ping client characteristics for a 1% sample (sample_id % 100 = 1),
# keeping both TxP and non-TxP clients so the groups can be compared.
txp = ms.filter("sample_id % 100 = 1") .select("client_id",
"profile_creation_date",
"country",
ms.normalized_channel.alias("channel"),
"submission_date",
"subsession_start_date",
# NOTE(review): Spark substr positions are 1-based; substr(0, 10) appears
# intended to take the 10-char YYYY-MM-DD date prefix — confirm the
# pos=0 behavior against the Spark version in use.
ms.subsession_start_date.substr(0,10).alias("date"),
"subsession_length",
"is_default_browser",
"active_addons_count",
"active_ticks",
"places_bookmarks_count",
# Parallel arrays extracted from active_addons: one entry per addon.
ms.active_addons["addon_id"].alias('addon_ids'),
ms.active_addons["name"].alias('addon_names'),
ms.active_addons.is_system.alias('system_addon'),
ms.active_addons.foreign_install.alias('foreign_addon'),
'txp_added'
)
# Because there can be multiple pings per day, one must agg over subsessions.
# Later, I plan to segregate addons based on their status (system vs foreign vs user), I need to be consistent in which information I take. I will select the first instance of addon instances for the comparisons.
# In[31]:
# Collapse multiple pings per client into one row. `first` is used for the
# addon arrays so that all addon-status comparisons come from the same ping
# (see the note above about consistency); session metrics are summed and
# slow-changing fields take max/first.
txp_agg = txp.groupBy("client_id") .agg(fun.first("txp_added").alias("txp_added"),
# relevant addons
fun.first("addon_ids").alias("addon_ids"),
fun.first("addon_names").alias("addon_names"),
fun.first("system_addon").alias("system_addons"),
fun.first("foreign_addon").alias("foreign_addons"),
fun.first("active_addons_count").alias("active_addons"),
# other metrics
fun.max("profile_creation_date").alias("profile_creation_date"),
fun.sum("subsession_length").alias("session_length"),
fun.sum("active_ticks").alias("active_ticks"),
fun.max("places_bookmarks_count").alias("bookmarks"),
fun.first("country").alias("country"),
fun.first("channel").alias("channel"),
)
# In[32]:
# Materialize the per-client aggregate in pandas for plotting/describe.
txp_df = txp_agg.toPandas()
# In[33]:
def BoolArray2Sum(array):
    """Count the True entries in *array*, skipping None values.

    Used on the per-addon boolean columns (e.g. ``system_addons``,
    ``foreign_addons``), which may contain nulls.

    Parameters
    ----------
    array : iterable of bool or None

    Returns
    -------
    int
        Number of True entries; 0 for an empty or all-None input.
    """
    # Builtin sum over a generator: no numpy dependency and no throwaway
    # intermediate list; an empty input yields int 0 (numerically equal to
    # np.sum's float 0.0, so callers are unaffected).
    return sum(int(val) for val in array if val is not None)
# In[34]:
# Human-readable group labels for the TxP flag (0/False -> 'fx', 1/True -> 'txp').
txp_df['txp_added'].replace({0: 'fx', 1: 'txp'}, inplace=True)
# In[35]:
# Deal with missing data and implausible values
txp_df.dropna(inplace=True)
# BUG FIX: after the replace above, txp_added holds strings (and the addon
# columns hold lists), so comparing the whole frame against 0 raises
# TypeError on Python 3. Apply the positivity filter to numeric columns only.
_numeric = txp_df.select_dtypes(include=[np.number])
txp_df = txp_df[(_numeric > 0).all(axis=1)]
# In[36]:
txp_df['system_addon_count'] = txp_df['system_addons'].apply(BoolArray2Sum)
# In[37]:
txp_df['foreign_addon_count'] = txp_df['foreign_addons'].apply(BoolArray2Sum)
# In[38]:
# User-installed addons = total active minus system and sideloaded addons.
txp_df['user_addon_count'] = txp_df['active_addons'] - txp_df['system_addon_count'] - txp_df['foreign_addon_count']
# In[39]:
# Handle outliers: keep clients within 3 standard deviations of the mean
# active-addon count.
txp_df = txp_df[np.abs(txp_df['active_addons'] - txp_df['active_addons'].mean()) < txp_df['active_addons'].std()*3]
# In[40]:
txp_df.groupby('txp_added').describe()
# Below are some comparison plots
# In[41]:
# Addon counts by group (trailing ';' suppresses the axes repr in the notebook).
txp_df.boxplot(['active_addons'], by='txp_added');
# In[42]:
txp_df.boxplot(['system_addon_count'], by='txp_added');
# In[43]:
txp_df.hist('system_addon_count', by='txp_added');
# In[44]:
txp_df.boxplot(['foreign_addon_count'], by='txp_added');
# In[45]:
txp_df.boxplot('user_addon_count', by='txp_added')
# In[46]:
txp_df.groupby('txp_added')['user_addon_count'].describe()
# In[47]:
# Bookmarks, profile age, and engagement metrics by group.
txp_df.hist('bookmarks', by='txp_added')
# In[48]:
txp_df.boxplot('profile_creation_date', by='txp_added')
# In[50]:
txp_df.boxplot('session_length', by='txp_added')
# In[55]:
txp_df.boxplot('active_ticks', by='txp_added')
# In[71]:
# Client counts per release channel within each group.
txp_df.groupby(['txp_added', 'channel']).agg({'channel': 'count'})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment