Skip to content

Instantly share code, notes, and snippets.

@emtwo
Last active September 23, 2016 20:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emtwo/170201de6063d052a73a8af7beae4bb8 to your computer and use it in GitHub Desktop.
Save emtwo/170201de6063d052a73a8af7beae4bb8 to your computer and use it in GitHub Desktop.
power_analysis
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ## Import Activity Stream Tables
# In[1]:
import time
import datetime
import pandas as pd
from datetime import date
import plotly.plotly as py
import plotly.graph_objs as go
from pyspark.sql.types import *
from pyspark.sql.functions import *
import statsmodels.stats.power as smp
# In[2]:
# Paths of the CSV exports of the two Activity Stream daily tables.
activity_stream_events_daily_url = "activity_stream_events_daily.csv"
activity_stream_stats_daily_url = "activity_stream_stats_daily.csv"
# In[3]:
# Load both exports into pandas. The separator is passed by keyword because
# pandas >= 2.0 accepts only the path as a positional argument of read_csv.
pandas_events = pd.read_csv(activity_stream_events_daily_url, sep=",")
pandas_stats = pd.read_csv(activity_stream_stats_daily_url, sep=",")
# In[4]:
# Columns that are loaded as Spark integers; every other column is a string.
integer_types = ["max_scroll_depth", "load_latency", "total_bookmarks", "total_history_size", "session_duration"]


def _schema_fields(column_names):
    """Return one nullable StructField per name in *column_names*.

    A column listed in ``integer_types`` gets IntegerType; any other column
    gets StringType.
    """
    fields = []
    for column_name in column_names:
        if column_name in integer_types:
            spark_type = IntegerType()
        else:
            spark_type = StringType()
        fields.append(StructField(column_name, spark_type, True))
    return fields


# Field lists follow the column order of the corresponding pandas frame.
events_fields = _schema_fields(pandas_events.columns)
stats_fields = _schema_fields(pandas_stats.columns)
events_schema = StructType(events_fields)
stats_schema = StructType(stats_fields)
# In[5]:
# Convert the pandas frames to Spark DataFrames using the explicit schemas.
activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, events_schema)
activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, stats_schema)
# In[6]:
# Expose both DataFrames to Spark SQL under their table names.
for _table_name, _frame in (
    ("activity_stream_events_daily", activity_stream_events_daily_df),
    ("activity_stream_stats_daily", activity_stream_stats_daily_df),
):
    sqlContext.registerDataFrameAsTable(_frame, _table_name)
# ## Daily Active Users Power Analysis
# In[48]:
# Daily active users: distinct client_id count per date, keeping the first 30
# rows when sorted by `date` descending. Result columns: date, unique_users.
dau = (
    activity_stream_stats_daily_df
    .select("date", "client_id")
    .groupBy("date")
    # Alias the aggregate here instead of looking it up afterwards by its
    # auto-generated name: that name is not stable across Spark releases
    # (newer ones emit "count(DISTINCT client_id)"), so a later
    # col("count(client_id)") lookup can fail to resolve.
    .agg(countDistinct("client_id").alias("unique_users"))
    .orderBy(desc("date"))
    .limit(30)
)
# In[58]:
# Summary statistics of the daily-active-user counts.
dau_stats = dau.describe()
dau_stats.show()
# Collect the summary once and index it by its label rather than by row
# position: one Spark job instead of two, and no dependence on the order in
# which describe() lists its rows.
_dau_summary = {
    row["summary"]: row["unique_users"]
    for row in dau_stats.select("summary", "unique_users").collect()
}
stddev_dau = _dau_summary["stddev"]
mean_dau = _dau_summary["mean"]
# In[59]:
# For each relative change in mean DAU from 4.0% to 20.0% (0.1-point steps),
# compute the number of observations needed in each group of a two-sided
# independent t-test with alpha = 0.05 and power = 0.95.
diff_in_means = []   # x-axis labels, e.g. "9.0%"
sample_sizes = []    # required observations per group, aligned with the labels
power_solver = smp.TTestIndPower()  # one solver reused for the whole sweep
# Step through integer tenths of a percent: repeatedly adding 0.001 to a float
# drifts, which makes the end point unreliable and produces labels such as
# "4.1000000000000005%".
for tenths_of_percent in range(40, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * float(mean_dau)) / float(stddev_dau)
    sample_size = power_solver.solve_power(
        effect_size=effect_size,
        power=0.95,
        alpha=0.05,
        alternative='two-sided',
    )
    diff_in_means.append("{:.1f}%".format(tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)
# In[62]:
# Plot required sample size against the detectable percent difference in DAU.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]
layout = {
    'title': 'DAU Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Daily Active Users)'},
    'yaxis': {'title': 'Sample Size (Number of Days)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# ##### DAU Plot: https://plot.ly/~emtwo/87/
# ## Conclusion:
#
# #### Need to run the experiment for ~60 days to observe a 9% difference (~674 DAU).
# ## Daily Tabs Open Power Analysis
# In[72]:
# Schema: |date|num_tabs|
# Number of rows with load_reason == "newtab" per date, keeping the first 30
# rows when sorted by `date` descending.
newtab_counts = (
    activity_stream_stats_daily_df
    .select("date", "load_reason")
    .filter(col("load_reason") == "newtab")
    .groupBy("date")
    .count()
    .withColumnRenamed("count", "num_tabs")
    .orderBy(col("date").desc())
    .limit(30)
)
# In[73]:
# Summary statistics of the daily new-tab counts.
newtab_stats = newtab_counts.describe()
newtab_stats.show()
# Collect the summary once and index it by its label rather than by row
# position: one Spark job instead of two, and no dependence on the order in
# which describe() lists its rows.
_newtab_summary = {
    row["summary"]: row["num_tabs"]
    for row in newtab_stats.select("summary", "num_tabs").collect()
}
stddev_tab_count = _newtab_summary["stddev"]
mean_tab_count = _newtab_summary["mean"]
# In[74]:
# For each relative change in mean daily new-tab count from 1.0% to 15.0%
# (0.1-point steps), compute the number of observations needed in each group
# of a two-sided independent t-test with alpha = 0.05 and power = 0.95.
diff_in_means = []   # x-axis labels, e.g. "10.0%"
sample_sizes = []    # required observations per group, aligned with the labels
power_solver = smp.TTestIndPower()  # one solver reused for the whole sweep
# Step through integer tenths of a percent: repeatedly adding 0.001 to a float
# drifts, which makes the end point unreliable and produces noisy labels.
for tenths_of_percent in range(10, 151):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * float(mean_tab_count)) / float(stddev_tab_count)
    sample_size = power_solver.solve_power(
        effect_size=effect_size,
        power=0.95,
        alpha=0.05,
        alternative='two-sided',
    )
    diff_in_means.append("{:.1f}%".format(tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)
# In[75]:
# Plot required sample size against the detectable percent difference in
# new tabs opened per day.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]
layout = {
    'title': 'Daily Open Tabs Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Tab Opens Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Days)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Open Tabs Plot: https://plot.ly/~emtwo/95/
# ## Conclusion:
#
# #### 1) Need to run the experiment for ~103 days to observe a 10% difference (~6000 new tabs).
# #### 2) Need to run the experiment for ~60 days to observe a 13% difference (~7900 new tabs).
# #### 3) It's unlikely we will observe a significant difference in new tabs opened per day.
# ## Daily Tabs Open Per User Power Analysis
# In[67]:
# Bounds of the per-user analysis window as epoch seconds for local midnight
# on 15 July 2016 and 15 August 2016.
# NOTE(review): time.mktime interprets the date in this machine's local
# timezone -- confirm that matches how the compared timestamps are produced.
july_fifteen, aug_fifteen = (
    time.mktime(datetime.datetime(2016, month, 15).timetuple())
    for month in (7, 8)
)
# In[68]:
# Per-client average of "newtab" loads per day: rows whose
# unix_timestamp(receive_at) lies in [july_fifteen, aug_fifteen) are counted
# per client_id and divided by 31. Only clients with at least one matching
# row appear in the result. Result columns: client_id, num_tabs.
newtab_counts = (
    activity_stream_stats_daily_df
    .select("client_id", "receive_at", "load_reason")
    .withColumn("date", unix_timestamp("receive_at"))
    .filter(
        (col("load_reason") == "newtab")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
    .select("client_id", "load_reason")
    .groupBy("client_id")
    .count()
    .withColumn("num_tabs", col("count") / 31)
    .select("client_id", "num_tabs")
    .orderBy("num_tabs")
)
# In[69]:
# Summary statistics of new tabs opened per user per day.
newtab_stats = newtab_counts.describe()
newtab_stats.show()
# Collect the summary once and index it by its label rather than by row
# position: one Spark job instead of two, and no dependence on the order in
# which describe() lists its rows.
_newtab_summary = {
    row["summary"]: row["num_tabs"]
    for row in newtab_stats.select("summary", "num_tabs").collect()
}
stddev_tab_count = _newtab_summary["stddev"]
mean_tab_count = _newtab_summary["mean"]
# In[70]:
# For each relative change in mean new tabs per user per day from 5.0% to
# 20.0% (0.1-point steps), compute the number of observations needed in each
# group of a two-sided independent t-test with alpha = 0.05 and power = 0.95.
diff_in_means = []   # x-axis labels, e.g. "10.0%"
sample_sizes = []    # required observations per group, aligned with the labels
power_solver = smp.TTestIndPower()  # one solver reused for the whole sweep
# Step through integer tenths of a percent: repeatedly adding 0.001 to a float
# drifts, which makes the end point unreliable and produces noisy labels.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * float(mean_tab_count)) / float(stddev_tab_count)
    sample_size = power_solver.solve_power(
        effect_size=effect_size,
        power=0.95,
        alpha=0.05,
        alternative='two-sided',
    )
    diff_in_means.append("{:.1f}%".format(tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)
# In[71]:
# Plot required sample size against the detectable percent difference in
# new tabs opened per user per day.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]
layout = {
    'title': 'Daily Tabs Open Per User Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Tab Opens Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Tabs Open Per User Plot: https://plot.ly/~emtwo/93/
# ## Conclusion:
#
# #### Need ~8000 users per arm to observe a 10% difference (~0.324 new tabs per user per day).
# ## Daily Searches Per User Power Analysis
# In[76]:
# Per-client average of "SEARCH" events per day: events whose
# unix_timestamp(receive_at) lies in [july_fifteen, aug_fifteen) are counted
# per client_id and divided by 31. Only clients with at least one matching
# event appear in the result. Result columns: client_id, num_search.
search_counts = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event")
    .withColumn("date", unix_timestamp("receive_at"))
    .filter(
        (col("event") == "SEARCH")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
    .groupBy("client_id")
    .count()
    .withColumn("num_search", col("count") / 31)
    .select("client_id", "num_search")
    .orderBy(col("num_search").desc())
)
# In[77]:
# Summary statistics of searches per user per day.
search_stats = search_counts.describe()
search_stats.show()
# Collect the summary once and index it by its label rather than by row
# position: one Spark job instead of two, and no dependence on the order in
# which describe() lists its rows.
_search_summary = {
    row["summary"]: row["num_search"]
    for row in search_stats.select("summary", "num_search").collect()
}
stddev_search_count = _search_summary["stddev"]
mean_search_count = _search_summary["mean"]
# In[78]:
# For each relative change in mean searches per user per day from 5.0% to
# 20.0% (0.1-point steps), compute the number of observations needed in each
# group of a two-sided independent t-test with alpha = 0.05 and power = 0.95.
diff_in_means = []   # x-axis labels, e.g. "12.0%"
sample_sizes = []    # required observations per group, aligned with the labels
power_solver = smp.TTestIndPower()  # one solver reused for the whole sweep
# Step through integer tenths of a percent: repeatedly adding 0.001 to a float
# drifts, which makes the end point unreliable and produces noisy labels.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * float(mean_search_count)) / float(stddev_search_count)
    sample_size = power_solver.solve_power(
        effect_size=effect_size,
        power=0.95,
        alpha=0.05,
        alternative='two-sided',
    )
    diff_in_means.append("{:.1f}%".format(tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)
# In[79]:
# Plot required sample size against the detectable percent difference in
# searches per user per day.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]
layout = {
    'title': 'Daily Searches Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Searches Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Searches Per User Plot: https://plot.ly/~emtwo/97/
# ## Conclusion:
#
# #### Need ~8114 users per arm to observe a 12% difference (~0.057 searches per user per day).
# ## Topsites Clicks Power Analysis
# In[83]:
# Per-client average of Top Sites clicks per day: "CLICK" events with source
# "TOP_SITES" whose unix_timestamp(receive_at) lies in
# [july_fifteen, aug_fifteen) are counted per client_id and divided by 31.
# Only clients with at least one matching event appear in the result.
# Result columns: client_id, num_clicks.
topsites_clickthrough = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event", "source")
    .withColumn("date", unix_timestamp("receive_at"))
    .filter(
        (col("event") == "CLICK")
        & (col("source") == "TOP_SITES")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
    .groupBy("client_id")
    .count()
    .withColumn("num_clicks", col("count") / 31)
    .select("client_id", "num_clicks")
)
# In[84]:
# Summary statistics of Top Sites clicks per user per day.
topsites_stats = topsites_clickthrough.describe()
topsites_stats.show()
# Collect the summary once and index it by its label rather than by row
# position: one Spark job instead of two, and no dependence on the order in
# which describe() lists its rows.
_topsites_summary = {
    row["summary"]: row["num_clicks"]
    for row in topsites_stats.select("summary", "num_clicks").collect()
}
stddev_topsites = _topsites_summary["stddev"]
mean_topsites_count = _topsites_summary["mean"]
# In[85]:
# For each relative change in mean Top Sites clicks per user per day from
# 5.0% to 20.0% (0.1-point steps), compute the number of observations needed
# in each group of a two-sided independent t-test with alpha = 0.05 and
# power = 0.95.
diff_in_means = []   # x-axis labels, e.g. "15.0%"
sample_sizes = []    # required observations per group, aligned with the labels
power_solver = smp.TTestIndPower()  # one solver reused for the whole sweep
# Step through integer tenths of a percent: repeatedly adding 0.001 to a float
# drifts, which makes the end point unreliable and produces noisy labels.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * float(mean_topsites_count)) / float(stddev_topsites)
    sample_size = power_solver.solve_power(
        effect_size=effect_size,
        power=0.95,
        alpha=0.05,
        alternative='two-sided',
    )
    diff_in_means.append("{:.1f}%".format(tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)
# In[86]:
# Plot required sample size against the detectable percent difference in
# Top Sites clicks per user per day.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]
layout = {
    'title': 'Topsite Clicks Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Topsite Clicks Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Topsite Clicks Plot: https://plot.ly/~emtwo/99/
# ## Conclusion:
#
# #### Need ~7360 users per arm to observe a 15% difference (~0.105 topsite clicks per user per day).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment