-
-
Save emtwo/170201de6063d052a73a8af7beae4bb8 to your computer and use it in GitHub Desktop.
power_analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ## Import Activity Stream Tables | |
# In[1]: | |
import time | |
import datetime | |
import pandas as pd | |
from datetime import date | |
import plotly.plotly as py | |
import plotly.graph_objs as go | |
from pyspark.sql.types import * | |
from pyspark.sql.functions import * | |
import statsmodels.stats.power as smp | |
# In[2]: | |
# File names of the two daily Activity Stream CSV exports read below.
activity_stream_events_daily_url = "activity_stream_events_daily.csv"
activity_stream_stats_daily_url = "activity_stream_stats_daily.csv"
# In[3]:
# The delimiter is passed by keyword: every read_csv argument after the path
# is keyword-only in current pandas, so a positional "," raises TypeError.
pandas_events = pd.read_csv(activity_stream_events_daily_url, sep=",")
pandas_stats = pd.read_csv(activity_stream_stats_daily_url, sep=",")
# In[4]: | |
# Columns declared as integers in the Spark schemas; every other CSV column
# is declared as a string.
integer_types = ["max_scroll_depth", "load_latency", "total_bookmarks", "total_history_size", "session_duration"]


def _nullable_field(column_name):
    """Return a nullable StructField for *column_name*.

    The field is an IntegerType when the name is listed in ``integer_types``
    and a StringType otherwise.
    """
    if column_name in integer_types:
        return StructField(column_name, IntegerType(), True)
    return StructField(column_name, StringType(), True)


# One field per column, in the column order of the corresponding pandas frame.
events_fields = [_nullable_field(column_name) for column_name in pandas_events.columns]
stats_fields = [_nullable_field(column_name) for column_name in pandas_stats.columns]
events_schema = StructType(events_fields)
stats_schema = StructType(stats_fields)
# In[5]: | |
# Turn each pandas frame into a Spark DataFrame using its explicit schema.
activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, events_schema)
activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, stats_schema)
# In[6]:
# Register both DataFrames on the SQL context under their table names.
for table_df, table_name in (
    (activity_stream_events_daily_df, "activity_stream_events_daily"),
    (activity_stream_stats_daily_df, "activity_stream_stats_daily"),
):
    sqlContext.registerDataFrameAsTable(table_df, table_name)
# ## Daily Active Users Power Analysis | |
# In[48]: | |
# Distinct client_ids per `date`, keeping the first 30 rows when ordered by
# `date` descending. Result columns: date, unique_users.
#
# The distinct count is aliased inside agg() instead of being selected
# afterwards by its auto-generated name ("count(client_id)"): that generated
# name is not the same across Spark versions, so the later lookup can fail.
dau = (
    activity_stream_stats_daily_df
    .select("date", "client_id")
    .groupBy("date")
    .agg(countDistinct("client_id").alias("unique_users"))
    .orderBy(desc("date"))
    .limit(30)
    .select("date", "unique_users")
)
# In[58]: | |
# Summary statistics of the daily unique-user counts.
dau_stats = dau.describe()
dau_stats.show()
# describe() labels each row in its "summary" column. Collect once and look
# the statistics up by label, rather than collecting twice and depending on
# the row positions of "mean" and "stddev".
dau_summary = {row.summary: row.unique_users for row in dau_stats.collect()}
# Values are left exactly as describe() returns them; the power-analysis cell
# converts them with float().
stddev_dau = dau_summary["stddev"]
mean_dau = dau_summary["mean"]
# In[59]: | |
# For each relative change in mean DAU from 4.0% to 20.0% (0.1-point steps),
# solve for the sample size needed by a two-sided independent t-test at
# alpha = 0.05 and power = 0.95.
#
# The grid is stepped in integer per-mille units so that both end points are
# evaluated and the labels read "4.1%" rather than "4.1000000000000005%";
# a float accumulator drifts and makes the final step unpredictable.
diff_in_means = []
sample_sizes = []
power_solver = TTestIndPower = smp.TTestIndPower()  # built once, reused for every step
dau_mean_value = float(mean_dau)
dau_stddev_value = float(stddev_dau)
for permille in range(40, 201):
    percent_diff = permille / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * dau_mean_value) / dau_stddev_value
    # nobs1 is left unset, so solve_power returns the size of the first sample.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')
    diff_in_means.append(str(permille / 10.0) + "%")
    sample_sizes.append(sample_size)
# In[62]: | |
# Plot required sample size (y) against the percent-difference labels (x).
trace = go.Scatter(x=diff_in_means, y=sample_sizes)
data = [trace]
layout = {
    'title': 'DAU Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Daily Active Users)'},
    'yaxis': {'title': 'Sample Size (Number of Days)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# ##### DAU Plot: https://plot.ly/~emtwo/87/ | |
# ## Conclusion: | |
# | |
# #### Need to run the experiment for ~60 days to observe a 9% difference (~674 DAU). | |
# ## Daily Tabs Open Power Analysis | |
# In[72]: | |
# Result schema: | date | num_tabs |
# Count the rows whose load_reason is "newtab" for each `date`, then keep the
# first 30 rows when ordered by `date` descending.
newtab_rows = (
    activity_stream_stats_daily_df
    .select("date", "load_reason")
    .where(col("load_reason") == "newtab")
)
tabs_per_day = newtab_rows.groupBy("date").count()
newtab_counts = (
    tabs_per_day
    .select("date", col("count").alias("num_tabs"))
    .orderBy(desc("date"))
    .limit(30)
)
# In[73]: | |
# Summary statistics of the per-day new-tab counts.
newtab_stats = newtab_counts.describe()
newtab_stats.show()
# describe() labels each row in its "summary" column. Collect once and look
# the statistics up by label, rather than collecting twice and depending on
# the row positions of "mean" and "stddev".
newtab_summary = {row.summary: row.num_tabs for row in newtab_stats.collect()}
# Values are left exactly as describe() returns them; the power-analysis cell
# converts them with float().
stddev_tab_count = newtab_summary["stddev"]
mean_tab_count = newtab_summary["mean"]
# In[74]: | |
# For each relative change in mean tab opens per day from 1.0% to 15.0%
# (0.1-point steps), solve for the sample size needed by a two-sided
# independent t-test at alpha = 0.05 and power = 0.95.
#
# The grid is stepped in integer per-mille units so that both end points are
# evaluated and the labels read "1.1%" rather than a float-noise string;
# a float accumulator drifts and makes the final step unpredictable.
diff_in_means = []
sample_sizes = []
power_solver = smp.TTestIndPower()  # built once, reused for every step
tab_mean_value = float(mean_tab_count)
tab_stddev_value = float(stddev_tab_count)
for permille in range(10, 151):
    percent_diff = permille / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
    # nobs1 is left unset, so solve_power returns the size of the first sample.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')
    diff_in_means.append(str(permille / 10.0) + "%")
    sample_sizes.append(sample_size)
# In[75]: | |
# Plot required sample size (y) against the percent-difference labels (x).
trace = go.Scatter(x=diff_in_means, y=sample_sizes)
data = [trace]
layout = {
    'title': 'Daily Open Tabs Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Tab Opens Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Days)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Open Tabs Plot: https://plot.ly/~emtwo/95/ | |
# ## Conclusion: | |
# | |
# #### 1) Need to run the experiment for ~103 days to observe a 10% difference (~6000 new tabs). | |
# #### 2) Need to run the experiment for ~60 days to observe a 13% difference (~7900 new tabs). | |
# #### 3) It's unlikely we will observe a significant difference in newtabs open per day. | |
# ## Daily Tabs Open Per User Power Analysis | |
# In[67]: | |
def _local_epoch_seconds(day_month_year):
    """Parse a 'dd/mm/YYYY' string and return it as a Unix timestamp.

    time.mktime interprets the parsed date as local time and returns a float.
    """
    parsed = datetime.datetime.strptime(day_month_year, "%d/%m/%Y")
    return time.mktime(parsed.timetuple())


# Bounds of the analysis window used by the per-user queries.
july_fifteen = _local_epoch_seconds('15/07/2016')
aug_fifteen = _local_epoch_seconds('15/08/2016')
# In[68]: | |
# Average "newtab" loads per client per day over [july_fifteen, aug_fifteen).
# `date` is replaced here by the Unix timestamp of receive_at so it can be
# compared with the window bounds. Only clients with at least one matching
# row appear in the result.
newtab_loads = (
    activity_stream_stats_daily_df
    .select("client_id", "receive_at", "load_reason")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(
        (col("load_reason") == "newtab")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
    .select("client_id", "load_reason")
)
loads_per_client = newtab_loads.groupBy("client_id").count()
# 31 = number of days from 15 July 2016 up to (not including) 15 August 2016.
newtab_counts = (
    loads_per_client
    .select("client_id", (col("count") / 31).alias("num_tabs"))
    .orderBy("num_tabs")
)
# In[69]: | |
# Summary statistics of new-tab opens per user per day.
newtab_stats = newtab_counts.describe()
newtab_stats.show()
# describe() labels each row in its "summary" column. Collect once and look
# the statistics up by label, rather than collecting twice and depending on
# the row positions of "mean" and "stddev".
newtab_summary = {row.summary: row.num_tabs for row in newtab_stats.collect()}
# Values are left exactly as describe() returns them; the power-analysis cell
# converts them with float().
stddev_tab_count = newtab_summary["stddev"]
mean_tab_count = newtab_summary["mean"]
# In[70]: | |
# For each relative change in mean tab opens per user per day from 5.0% to
# 20.0% (0.1-point steps), solve for the sample size needed by a two-sided
# independent t-test at alpha = 0.05 and power = 0.95.
#
# The grid is stepped in integer per-mille units so that both end points are
# evaluated and the labels read "5.1%" rather than a float-noise string;
# a float accumulator drifts and makes the final step unpredictable.
diff_in_means = []
sample_sizes = []
power_solver = smp.TTestIndPower()  # built once, reused for every step
tab_mean_value = float(mean_tab_count)
tab_stddev_value = float(stddev_tab_count)
for permille in range(50, 201):
    percent_diff = permille / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
    # nobs1 is left unset, so solve_power returns the size of the first sample.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')
    diff_in_means.append(str(permille / 10.0) + "%")
    sample_sizes.append(sample_size)
# In[71]: | |
# Plot required sample size (y) against the percent-difference labels (x).
trace = go.Scatter(x=diff_in_means, y=sample_sizes)
data = [trace]
layout = {
    'title': 'Daily Tabs Open Per User Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Tab Opens Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Tabs Open Per User Plot: https://plot.ly/~emtwo/93/ | |
# ## Conclusion: | |
# | |
# #### Need ~8000 users per arm to observe a 10% difference (~0.324 new tabs per user per day). | |
# ## Daily Searches Power Analysis | |
# In[76]: | |
# Average "SEARCH" events per client per day over [july_fifteen, aug_fifteen).
# `date` is the Unix timestamp of receive_at so it can be compared with the
# window bounds. Only clients with at least one matching event appear.
search_events = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(
        (col("event") == "SEARCH")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
)
searches_per_client = search_events.groupBy("client_id").count()
# 31 = number of days from 15 July 2016 up to (not including) 15 August 2016.
search_counts = (
    searches_per_client
    .select("client_id", (col("count") / 31).alias("num_search"))
    .orderBy(desc("num_search"))
)
# In[77]: | |
# Summary statistics of searches per user per day.
search_stats = search_counts.describe()
search_stats.show()
# describe() labels each row in its "summary" column. Collect once and look
# the statistics up by label, rather than collecting twice and depending on
# the row positions of "mean" and "stddev".
search_summary = {row.summary: row.num_search for row in search_stats.collect()}
# Values are left exactly as describe() returns them; the power-analysis cell
# converts them with float().
stddev_search_count = search_summary["stddev"]
mean_search_count = search_summary["mean"]
# In[78]: | |
# For each relative change in mean searches per user per day from 5.0% to
# 20.0% (0.1-point steps), solve for the sample size needed by a two-sided
# independent t-test at alpha = 0.05 and power = 0.95.
#
# The grid is stepped in integer per-mille units so that both end points are
# evaluated and the labels read "5.1%" rather than a float-noise string;
# a float accumulator drifts and makes the final step unpredictable.
diff_in_means = []
sample_sizes = []
power_solver = smp.TTestIndPower()  # built once, reused for every step
search_mean_value = float(mean_search_count)
search_stddev_value = float(stddev_search_count)
for permille in range(50, 201):
    percent_diff = permille / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * search_mean_value) / search_stddev_value
    # nobs1 is left unset, so solve_power returns the size of the first sample.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')
    diff_in_means.append(str(permille / 10.0) + "%")
    sample_sizes.append(sample_size)
# In[79]: | |
# Plot required sample size (y) against the percent-difference labels (x).
trace = go.Scatter(x=diff_in_means, y=sample_sizes)
data = [trace]
layout = {
    'title': 'Daily Searches Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Searches Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Daily Searches Per User Plot: https://plot.ly/~emtwo/97/ | |
# ## Conclusion: | |
# | |
# #### Need ~8114 users per arm to observe a 12% difference (~0.057 searches per user per day). | |
# ## Topsites Clicks Power Analysis | |
# In[83]: | |
# Average "CLICK" events with source "TOP_SITES" per client per day over
# [july_fifteen, aug_fifteen). `date` is the Unix timestamp of receive_at so
# it can be compared with the window bounds. Only clients with at least one
# matching event appear.
topsites_clicks = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event", "source")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(
        (col("event") == "CLICK")
        & (col("source") == "TOP_SITES")
        & (col("date") >= july_fifteen)
        & (col("date") < aug_fifteen)
    )
)
clicks_per_client = topsites_clicks.groupBy("client_id").count()
# 31 = number of days from 15 July 2016 up to (not including) 15 August 2016.
topsites_clickthrough = clicks_per_client.select(
    "client_id",
    (col("count") / 31).alias("num_clicks"),
)
# In[84]: | |
# Summary statistics of topsite clicks per user per day.
topsites_stats = topsites_clickthrough.describe()
topsites_stats.show()
# describe() labels each row in its "summary" column. Collect once and look
# the statistics up by label, rather than collecting twice and depending on
# the row positions of "mean" and "stddev".
topsites_summary = {row.summary: row.num_clicks for row in topsites_stats.collect()}
# Values are left exactly as describe() returns them; the power-analysis cell
# converts them with float().
stddev_topsites = topsites_summary["stddev"]
mean_topsites_count = topsites_summary["mean"]
# In[85]: | |
# For each relative change in mean topsite clicks per user per day from 5.0%
# to 20.0% (0.1-point steps), solve for the sample size needed by a two-sided
# independent t-test at alpha = 0.05 and power = 0.95.
#
# The grid is stepped in integer per-mille units so that both end points are
# evaluated and the labels read "5.1%" rather than a float-noise string;
# a float accumulator drifts and makes the final step unpredictable.
diff_in_means = []
sample_sizes = []
power_solver = smp.TTestIndPower()  # built once, reused for every step
topsites_mean_value = float(mean_topsites_count)
topsites_stddev_value = float(stddev_topsites)
for permille in range(50, 201):
    percent_diff = permille / 1000.0
    # Standardized effect size: absolute change in the mean over the stddev.
    effect_size = (percent_diff * topsites_mean_value) / topsites_stddev_value
    # nobs1 is left unset, so solve_power returns the size of the first sample.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')
    diff_in_means.append(str(permille / 10.0) + "%")
    sample_sizes.append(sample_size)
# In[86]: | |
# Plot required sample size (y) against the percent-difference labels (x).
trace = go.Scatter(x=diff_in_means, y=sample_sizes)
data = [trace]
layout = {
    'title': 'Topsite Clicks Power Analysis - Alpha = 0.05. Power = 0.95',
    'xaxis': {'title': 'Percent Difference in Means (Topsite Clicks Per User Per Day)'},
    'yaxis': {'title': 'Sample Size (Number of Users)'},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)
# #### Topsite Clicks Plot: https://plot.ly/~emtwo/99/ | |
# ## Conclusion: | |
# | |
# #### Need ~7360 users per arm to observe a 15% difference (~0.105 topsite clicks per user per day). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment