emtwo/power_analysis.ipynb Secret

## power_analysis.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              power_analysis.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## power_analysis.py

# coding: utf-8

# ## Import Activity Stream Tables

# In[1]:

import time
import datetime
import pandas as pd
from datetime import date
import plotly.plotly as py
import plotly.graph_objs as go
from pyspark.sql.types import *
from pyspark.sql.functions import *
import statsmodels.stats.power as smp


# In[2]:

# Daily Activity Stream exports, resolved relative to the working directory.
activity_stream_events_daily_url = "activity_stream_events_daily.csv"
activity_stream_stats_daily_url = "activity_stream_stats_daily.csv"


# In[3]:

# Load both exports into pandas first. `sep` is passed by keyword because
# pandas 2.0+ no longer accepts positional arguments after the file path.
pandas_events = pd.read_csv(activity_stream_events_daily_url, sep=",")
pandas_stats = pd.read_csv(activity_stream_stats_daily_url, sep=",")


# In[4]:

# Columns given a nullable integer type; every other column becomes a nullable string.
integer_types = ["max_scroll_depth", "load_latency", "total_bookmarks", "total_history_size", "session_duration"]


def _nullable_field(field_name):
    """Return a nullable StructField: IntegerType if the name is in `integer_types`, else StringType."""
    field_type = IntegerType() if field_name in integer_types else StringType()
    return StructField(field_name, field_type, True)


events_fields = [_nullable_field(field_name) for field_name in pandas_events.columns]
stats_fields = [_nullable_field(field_name) for field_name in pandas_stats.columns]

events_schema = StructType(events_fields)
stats_schema = StructType(stats_fields)


# In[5]:

# NOTE(review): `sqlContext` is not defined anywhere in this file; presumably it
# is injected by the Spark notebook environment -- confirm before running standalone.
activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, schema=events_schema)
activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, schema=stats_schema)


# In[6]:

# Register both DataFrames as named tables on the SQL context.
sqlContext.registerDataFrameAsTable(activity_stream_events_daily_df, "activity_stream_events_daily")
sqlContext.registerDataFrameAsTable(activity_stream_stats_daily_df, "activity_stream_stats_daily")


# ## Daily Active Users Power Analysis

# In[48]:

# One row per `date` with the number of distinct clients, limited to the 30
# rows with the greatest `date` values (descending sort).
dau = (
    activity_stream_stats_daily_df
    .select("date", "client_id")
    .groupBy("date")
    # Name the aggregate here rather than selecting it afterwards by its
    # auto-generated column name, which appears to vary between Spark releases.
    .agg(countDistinct("client_id").alias("unique_users"))
    .orderBy(desc("date"))
    .limit(30)
)


# In[58]:

dau_stats = dau.describe()
dau_stats.show()

# describe() labels each statistic in its "summary" column. Collect once and
# look the values up by label instead of relying on row position.
dau_summary = {row["summary"]: row["unique_users"] for row in dau_stats.collect()}
stddev_dau = dau_summary["stddev"]
mean_dau = dau_summary["mean"]


# In[59]:

diff_in_means = []
sample_sizes = []

# Loop-invariant inputs, converted once.
dau_mean_value = float(mean_dau)
dau_stddev_value = float(stddev_dau)
power_solver = smp.TTestIndPower()

# Sweep relative differences from 4.0% to 20.0% inclusive in 0.1% steps.
# Integer steps keep the sweep inside its stated bounds and avoid the float
# drift of repeatedly adding 0.001.
for tenths_of_percent in range(40, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute difference in means over the stddev.
    effect_size = (percent_diff * dau_mean_value) / dau_stddev_value
    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)


# In[62]:

# Required sample size as a function of the detectable relative difference.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

layout = dict(
    title='DAU Power Analysis - Alpha = 0.05. Power = 0.95',
    xaxis=dict(title='Percent Difference in Means (Daily Active Users)'),
    yaxis=dict(title='Sample Size (Number of Days)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


# #### DAU Plot: https://plot.ly/~emtwo/87/

# ## Conclusion:
#
# #### Need to run the experiment for ~60 days to observe a 9% difference (~674 DAU).

# ## Daily Tabs Open Power Analysis

# In[72]:

# Schema: |Date|Tabs Open Count|
# Rows with load_reason == "newtab" counted per `date`, limited to the 30 rows
# with the greatest `date` values (descending sort).
newtab_counts = (
    activity_stream_stats_daily_df
    .select("date", "load_reason")
    .where(col("load_reason") == "newtab")
    .groupBy("date").count()
    .select("date", col("count").alias("num_tabs"))
    .orderBy(desc("date"))
    .limit(30)
)


# In[73]:

newtab_stats = newtab_counts.describe()
newtab_stats.show()

# describe() labels each statistic in its "summary" column. Collect once and
# look the values up by label instead of relying on row position.
newtab_summary = {row["summary"]: row["num_tabs"] for row in newtab_stats.collect()}
stddev_tab_count = newtab_summary["stddev"]
mean_tab_count = newtab_summary["mean"]


# In[74]:

diff_in_means = []
sample_sizes = []

# Loop-invariant inputs, converted once.
tab_mean_value = float(mean_tab_count)
tab_stddev_value = float(stddev_tab_count)
power_solver = smp.TTestIndPower()

# Sweep relative differences from 1.0% to 15.0% inclusive in 0.1% steps.
# Integer steps keep the sweep inside its stated bounds and avoid the float
# drift of repeatedly adding 0.001.
for tenths_of_percent in range(10, 151):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute difference in means over the stddev.
    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)


# In[75]:

# Required sample size as a function of the detectable relative difference.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

layout = dict(
    title='Daily Open Tabs Power Analysis - Alpha = 0.05. Power = 0.95',
    xaxis=dict(title='Percent Difference in Means (Tab Opens Per Day)'),
    yaxis=dict(title='Sample Size (Number of Days)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


# #### Daily Open Tabs Plot: https://plot.ly/~emtwo/95/

# ## Conclusion:
#
# #### 1) Need to run the experiment for ~103 days to observe a 10% difference (~6000 new tabs).
# #### 2) Need to run the experiment for ~60 days to observe a 13% difference (~7900 new tabs).
# #### 3) It's unlikely we will observe a significant difference in new tabs opened per day.

# ## Daily Tabs Open Per User Power Analysis

# In[67]:

# Window bounds as Unix timestamps: 15 Jul 2016 (inclusive) to 15 Aug 2016 (exclusive).
# NOTE(review): time.mktime interprets the parsed date in this process's local
# timezone; confirm that matches how Spark evaluates unix_timestamp("receive_at").
july_fifteen = time.mktime(datetime.datetime.strptime('15/07/2016', "%d/%m/%Y").timetuple())
aug_fifteen = time.mktime(datetime.datetime.strptime('15/08/2016', "%d/%m/%Y").timetuple())


# In[68]:

# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
# totals into per-day averages.
days_in_window = 31

# Average "newtab" loads per day for each client inside the window. Clients
# with no matching rows do not appear, so the statistics below are taken over
# clients with at least one new tab in the window.
newtab_counts = (
    activity_stream_stats_daily_df
    .select("client_id", "receive_at", "load_reason")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(col("load_reason") == "newtab")
    .where(col("date") >= july_fifteen)
    .where(col("date") < aug_fifteen)
    .select("client_id", "load_reason")
    .groupBy("client_id").count()
    .select("client_id", (col("count") / days_in_window).alias("num_tabs"))
    .orderBy("num_tabs")
)


# In[69]:

newtab_stats = newtab_counts.describe()
newtab_stats.show()

# describe() labels each statistic in its "summary" column. Collect once and
# look the values up by label instead of relying on row position.
newtab_summary = {row["summary"]: row["num_tabs"] for row in newtab_stats.collect()}
stddev_tab_count = newtab_summary["stddev"]
mean_tab_count = newtab_summary["mean"]


# In[70]:

diff_in_means = []
sample_sizes = []

# Loop-invariant inputs, converted once.
tab_mean_value = float(mean_tab_count)
tab_stddev_value = float(stddev_tab_count)
power_solver = smp.TTestIndPower()

# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
# Integer steps keep the sweep inside its stated bounds and avoid the float
# drift of repeatedly adding 0.001.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute difference in means over the stddev.
    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)


# In[71]:

# Required sample size as a function of the detectable relative difference.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

layout = dict(
    title='Daily Tabs Open Per User Power Analysis - Alpha = 0.05. Power = 0.95',
    xaxis=dict(title='Percent Difference in Means (Tab Opens Per User Per Day)'),
    yaxis=dict(title='Sample Size (Number of Users)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


# #### Daily Tabs Open Per User Plot: https://plot.ly/~emtwo/93/

# ## Conclusion:
#
# #### Need ~8000 users per arm to observe a 10% difference (~0.324 new tabs per user per day).

# ## Daily Searches Power Analysis

# In[76]:

# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
# totals into per-day averages.
days_in_window = 31

# Average SEARCH events per day for each client inside the window. Clients
# with no matching rows do not appear, so the statistics below are taken over
# clients with at least one search in the window.
search_counts = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(col("event") == "SEARCH")
    .where(col("date") >= july_fifteen)
    .where(col("date") < aug_fifteen)
    .groupBy("client_id").count()
    .select("client_id", (col("count") / days_in_window).alias("num_search"))
    .orderBy(desc("num_search"))
)


# In[77]:

search_stats = search_counts.describe()
search_stats.show()

# describe() labels each statistic in its "summary" column. Collect once and
# look the values up by label instead of relying on row position.
search_summary = {row["summary"]: row["num_search"] for row in search_stats.collect()}
stddev_search_count = search_summary["stddev"]
mean_search_count = search_summary["mean"]


# In[78]:

diff_in_means = []
sample_sizes = []

# Loop-invariant inputs, converted once.
search_mean_value = float(mean_search_count)
search_stddev_value = float(stddev_search_count)
power_solver = smp.TTestIndPower()

# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
# Integer steps keep the sweep inside its stated bounds and avoid the float
# drift of repeatedly adding 0.001.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute difference in means over the stddev.
    effect_size = (percent_diff * search_mean_value) / search_stddev_value
    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)


# In[79]:

# Required sample size as a function of the detectable relative difference.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

layout = dict(
    title='Daily Searches Power Analysis - Alpha = 0.05. Power = 0.95',
    xaxis=dict(title='Percent Difference in Means (Searches Per User Per Day)'),
    yaxis=dict(title='Sample Size (Number of Users)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


# #### Daily Searches Per User Plot: https://plot.ly/~emtwo/97/

# ## Conclusion:
#
# #### Need ~8114 users per arm to observe a 12% difference (~0.057 searches per user per day).

# ## Topsites Clicks Power Analysis

# In[83]:

# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
# totals into per-day averages.
days_in_window = 31

# Average CLICK events with source TOP_SITES per day for each client inside
# the window. Clients with no matching rows do not appear, so the statistics
# below are taken over clients with at least one such click in the window.
topsites_clickthrough = (
    activity_stream_events_daily_df
    .select("client_id", "receive_at", "event", "source")
    .withColumn("date", unix_timestamp("receive_at"))
    .where(col("event") == "CLICK")
    .where(col("source") == "TOP_SITES")
    .where(col("date") >= july_fifteen)
    .where(col("date") < aug_fifteen)
    .groupBy("client_id").count()
    .select("client_id", (col("count") / days_in_window).alias("num_clicks"))
)


# In[84]:

topsites_stats = topsites_clickthrough.describe()
topsites_stats.show()

# describe() labels each statistic in its "summary" column. Collect once and
# look the values up by label instead of relying on row position.
topsites_summary = {row["summary"]: row["num_clicks"] for row in topsites_stats.collect()}
stddev_topsites = topsites_summary["stddev"]
mean_topsites_count = topsites_summary["mean"]


# In[85]:

diff_in_means = []
sample_sizes = []

# Loop-invariant inputs, converted once.
topsites_mean_value = float(mean_topsites_count)
topsites_stddev_value = float(stddev_topsites)
power_solver = smp.TTestIndPower()

# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
# Integer steps keep the sweep inside its stated bounds and avoid the float
# drift of repeatedly adding 0.001.
for tenths_of_percent in range(50, 201):
    percent_diff = tenths_of_percent / 1000.0
    # Standardized effect size: absolute difference in means over the stddev.
    effect_size = (percent_diff * topsites_mean_value) / topsites_stddev_value
    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
    sample_sizes.append(sample_size)


# In[86]:

# Required sample size as a function of the detectable relative difference.
data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

layout = dict(
    title='Topsite Clicks Power Analysis - Alpha = 0.05. Power = 0.95',
    xaxis=dict(title='Percent Difference in Means (Topsite Clicks Per User Per Day)'),
    yaxis=dict(title='Sample Size (Number of Users)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig)


# #### Topsite Clicks Plot: https://plot.ly/~emtwo/99/

# ## Conclusion:
#
# #### Need ~7360 users per arm to observe a 15% difference (~0.105 topsite clicks per user per day).

	# coding: utf-8

	# ## Import Activity Stream Tables

	# In[1]:

	import time
	import datetime
	import pandas as pd
	from datetime import date
	import plotly.plotly as py
	import plotly.graph_objs as go
	from pyspark.sql.types import *
	from pyspark.sql.functions import *
	import statsmodels.stats.power as smp


	# In[2]:

	# Daily Activity Stream exports, resolved relative to the working directory.
	activity_stream_events_daily_url = "activity_stream_events_daily.csv"
	activity_stream_stats_daily_url = "activity_stream_stats_daily.csv"


	# In[3]:

	# Load both exports into pandas first. `sep` is passed by keyword because
	# pandas 2.0+ no longer accepts positional arguments after the file path.
	pandas_events = pd.read_csv(activity_stream_events_daily_url, sep=",")
	pandas_stats = pd.read_csv(activity_stream_stats_daily_url, sep=",")


	# In[4]:

	# Columns given a nullable integer type; every other column becomes a nullable string.
	integer_types = ["max_scroll_depth", "load_latency", "total_bookmarks", "total_history_size", "session_duration"]


	def _nullable_field(field_name):
	    """Return a nullable StructField: IntegerType if the name is in `integer_types`, else StringType."""
	    field_type = IntegerType() if field_name in integer_types else StringType()
	    return StructField(field_name, field_type, True)


	events_fields = [_nullable_field(field_name) for field_name in pandas_events.columns]
	stats_fields = [_nullable_field(field_name) for field_name in pandas_stats.columns]

	events_schema = StructType(events_fields)
	stats_schema = StructType(stats_fields)


	# In[5]:

	# NOTE(review): `sqlContext` is not defined anywhere in this file; presumably it
	# is injected by the Spark notebook environment -- confirm before running standalone.
	activity_stream_events_daily_df = sqlContext.createDataFrame(pandas_events, schema=events_schema)
	activity_stream_stats_daily_df = sqlContext.createDataFrame(pandas_stats, schema=stats_schema)


	# In[6]:

	# Register both DataFrames as named tables on the SQL context.
	sqlContext.registerDataFrameAsTable(activity_stream_events_daily_df, "activity_stream_events_daily")
	sqlContext.registerDataFrameAsTable(activity_stream_stats_daily_df, "activity_stream_stats_daily")


	# ## Daily Active Users Power Analysis

	# In[48]:

	# One row per `date` with the number of distinct clients, limited to the 30
	# rows with the greatest `date` values (descending sort).
	dau = (
	    activity_stream_stats_daily_df
	    .select("date", "client_id")
	    .groupBy("date")
	    # Name the aggregate here rather than selecting it afterwards by its
	    # auto-generated column name, which appears to vary between Spark releases.
	    .agg(countDistinct("client_id").alias("unique_users"))
	    .orderBy(desc("date"))
	    .limit(30)
	)


	# In[58]:

	dau_stats = dau.describe()
	dau_stats.show()

	# describe() labels each statistic in its "summary" column. Collect once and
	# look the values up by label instead of relying on row position.
	dau_summary = {row["summary"]: row["unique_users"] for row in dau_stats.collect()}
	stddev_dau = dau_summary["stddev"]
	mean_dau = dau_summary["mean"]


	# In[59]:

	diff_in_means = []
	sample_sizes = []

	# Loop-invariant inputs, converted once.
	dau_mean_value = float(mean_dau)
	dau_stddev_value = float(stddev_dau)
	power_solver = smp.TTestIndPower()

	# Sweep relative differences from 4.0% to 20.0% inclusive in 0.1% steps.
	# Integer steps keep the sweep inside its stated bounds and avoid the float
	# drift of repeatedly adding 0.001.
	for tenths_of_percent in range(40, 201):
	    percent_diff = tenths_of_percent / 1000.0
	    # Standardized effect size: absolute difference in means over the stddev.
	    effect_size = (percent_diff * dau_mean_value) / dau_stddev_value
	    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
	    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

	    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
	    sample_sizes.append(sample_size)


	# In[62]:

	# Required sample size as a function of the detectable relative difference.
	data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

	layout = dict(
	    title='DAU Power Analysis - Alpha = 0.05. Power = 0.95',
	    xaxis=dict(title='Percent Difference in Means (Daily Active Users)'),
	    yaxis=dict(title='Sample Size (Number of Days)'),
	)

	fig = dict(data=data, layout=layout)
	py.iplot(fig)


	# #### DAU Plot: https://plot.ly/~emtwo/87/

	# ## Conclusion:
	#
	# #### Need to run the experiment for ~60 days to observe a 9% difference (~674 DAU).

	# ## Daily Tabs Open Power Analysis

	# In[72]:

	# Schema: |Date|Tabs Open Count|
	# Rows with load_reason == "newtab" counted per `date`, limited to the 30 rows
	# with the greatest `date` values (descending sort).
	newtab_counts = (
	    activity_stream_stats_daily_df
	    .select("date", "load_reason")
	    .where(col("load_reason") == "newtab")
	    .groupBy("date").count()
	    .select("date", col("count").alias("num_tabs"))
	    .orderBy(desc("date"))
	    .limit(30)
	)


	# In[73]:

	newtab_stats = newtab_counts.describe()
	newtab_stats.show()

	# describe() labels each statistic in its "summary" column. Collect once and
	# look the values up by label instead of relying on row position.
	newtab_summary = {row["summary"]: row["num_tabs"] for row in newtab_stats.collect()}
	stddev_tab_count = newtab_summary["stddev"]
	mean_tab_count = newtab_summary["mean"]


	# In[74]:

	diff_in_means = []
	sample_sizes = []

	# Loop-invariant inputs, converted once.
	tab_mean_value = float(mean_tab_count)
	tab_stddev_value = float(stddev_tab_count)
	power_solver = smp.TTestIndPower()

	# Sweep relative differences from 1.0% to 15.0% inclusive in 0.1% steps.
	# Integer steps keep the sweep inside its stated bounds and avoid the float
	# drift of repeatedly adding 0.001.
	for tenths_of_percent in range(10, 151):
	    percent_diff = tenths_of_percent / 1000.0
	    # Standardized effect size: absolute difference in means over the stddev.
	    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
	    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
	    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

	    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
	    sample_sizes.append(sample_size)


	# In[75]:

	# Required sample size as a function of the detectable relative difference.
	data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

	layout = dict(
	    title='Daily Open Tabs Power Analysis - Alpha = 0.05. Power = 0.95',
	    xaxis=dict(title='Percent Difference in Means (Tab Opens Per Day)'),
	    yaxis=dict(title='Sample Size (Number of Days)'),
	)

	fig = dict(data=data, layout=layout)
	py.iplot(fig)


	# #### Daily Open Tabs Plot: https://plot.ly/~emtwo/95/

	# ## Conclusion:
	#
	# #### 1) Need to run the experiment for ~103 days to observe a 10% difference (~6000 new tabs).
	# #### 2) Need to run the experiment for ~60 days to observe a 13% difference (~7900 new tabs).
	# #### 3) It's unlikely we will observe a significant difference in new tabs opened per day.

	# ## Daily Tabs Open Per User Power Analysis

	# In[67]:

	# Window bounds as Unix timestamps: 15 Jul 2016 (inclusive) to 15 Aug 2016 (exclusive).
	# NOTE(review): time.mktime interprets the parsed date in this process's local
	# timezone; confirm that matches how Spark evaluates unix_timestamp("receive_at").
	july_fifteen = time.mktime(datetime.datetime.strptime('15/07/2016', "%d/%m/%Y").timetuple())
	aug_fifteen = time.mktime(datetime.datetime.strptime('15/08/2016', "%d/%m/%Y").timetuple())


	# In[68]:

	# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
	# totals into per-day averages.
	days_in_window = 31

	# Average "newtab" loads per day for each client inside the window. Clients
	# with no matching rows do not appear, so the statistics below are taken over
	# clients with at least one new tab in the window.
	newtab_counts = (
	    activity_stream_stats_daily_df
	    .select("client_id", "receive_at", "load_reason")
	    .withColumn("date", unix_timestamp("receive_at"))
	    .where(col("load_reason") == "newtab")
	    .where(col("date") >= july_fifteen)
	    .where(col("date") < aug_fifteen)
	    .select("client_id", "load_reason")
	    .groupBy("client_id").count()
	    .select("client_id", (col("count") / days_in_window).alias("num_tabs"))
	    .orderBy("num_tabs")
	)


	# In[69]:

	newtab_stats = newtab_counts.describe()
	newtab_stats.show()

	# describe() labels each statistic in its "summary" column. Collect once and
	# look the values up by label instead of relying on row position.
	newtab_summary = {row["summary"]: row["num_tabs"] for row in newtab_stats.collect()}
	stddev_tab_count = newtab_summary["stddev"]
	mean_tab_count = newtab_summary["mean"]


	# In[70]:

	diff_in_means = []
	sample_sizes = []

	# Loop-invariant inputs, converted once.
	tab_mean_value = float(mean_tab_count)
	tab_stddev_value = float(stddev_tab_count)
	power_solver = smp.TTestIndPower()

	# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
	# Integer steps keep the sweep inside its stated bounds and avoid the float
	# drift of repeatedly adding 0.001.
	for tenths_of_percent in range(50, 201):
	    percent_diff = tenths_of_percent / 1000.0
	    # Standardized effect size: absolute difference in means over the stddev.
	    effect_size = (percent_diff * tab_mean_value) / tab_stddev_value
	    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
	    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

	    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
	    sample_sizes.append(sample_size)


	# In[71]:

	# Required sample size as a function of the detectable relative difference.
	data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

	layout = dict(
	    title='Daily Tabs Open Per User Power Analysis - Alpha = 0.05. Power = 0.95',
	    xaxis=dict(title='Percent Difference in Means (Tab Opens Per User Per Day)'),
	    yaxis=dict(title='Sample Size (Number of Users)'),
	)

	fig = dict(data=data, layout=layout)
	py.iplot(fig)


	# #### Daily Tabs Open Per User Plot: https://plot.ly/~emtwo/93/

	# ## Conclusion:
	#
	# #### Need ~8000 users per arm to observe a 10% difference (~0.324 new tabs per user per day).

	# ## Daily Searches Power Analysis

	# In[76]:

	# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
	# totals into per-day averages.
	days_in_window = 31

	# Average SEARCH events per day for each client inside the window. Clients
	# with no matching rows do not appear, so the statistics below are taken over
	# clients with at least one search in the window.
	search_counts = (
	    activity_stream_events_daily_df
	    .select("client_id", "receive_at", "event")
	    .withColumn("date", unix_timestamp("receive_at"))
	    .where(col("event") == "SEARCH")
	    .where(col("date") >= july_fifteen)
	    .where(col("date") < aug_fifteen)
	    .groupBy("client_id").count()
	    .select("client_id", (col("count") / days_in_window).alias("num_search"))
	    .orderBy(desc("num_search"))
	)


	# In[77]:

	search_stats = search_counts.describe()
	search_stats.show()

	# describe() labels each statistic in its "summary" column. Collect once and
	# look the values up by label instead of relying on row position.
	search_summary = {row["summary"]: row["num_search"] for row in search_stats.collect()}
	stddev_search_count = search_summary["stddev"]
	mean_search_count = search_summary["mean"]


	# In[78]:

	diff_in_means = []
	sample_sizes = []

	# Loop-invariant inputs, converted once.
	search_mean_value = float(mean_search_count)
	search_stddev_value = float(stddev_search_count)
	power_solver = smp.TTestIndPower()

	# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
	# Integer steps keep the sweep inside its stated bounds and avoid the float
	# drift of repeatedly adding 0.001.
	for tenths_of_percent in range(50, 201):
	    percent_diff = tenths_of_percent / 1000.0
	    # Standardized effect size: absolute difference in means over the stddev.
	    effect_size = (percent_diff * search_mean_value) / search_stddev_value
	    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
	    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

	    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
	    sample_sizes.append(sample_size)


	# In[79]:

	# Required sample size as a function of the detectable relative difference.
	data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

	layout = dict(
	    title='Daily Searches Power Analysis - Alpha = 0.05. Power = 0.95',
	    xaxis=dict(title='Percent Difference in Means (Searches Per User Per Day)'),
	    yaxis=dict(title='Sample Size (Number of Users)'),
	)

	fig = dict(data=data, layout=layout)
	py.iplot(fig)


	# #### Daily Searches Per User Plot: https://plot.ly/~emtwo/97/

	# ## Conclusion:
	#
	# #### Need ~8114 users per arm to observe a 12% difference (~0.057 searches per user per day).

	# ## Topsites Clicks Power Analysis

	# In[83]:

	# Number of calendar days in the [15 Jul, 15 Aug) window; turns per-client
	# totals into per-day averages.
	days_in_window = 31

	# Average CLICK events with source TOP_SITES per day for each client inside
	# the window. Clients with no matching rows do not appear, so the statistics
	# below are taken over clients with at least one such click in the window.
	topsites_clickthrough = (
	    activity_stream_events_daily_df
	    .select("client_id", "receive_at", "event", "source")
	    .withColumn("date", unix_timestamp("receive_at"))
	    .where(col("event") == "CLICK")
	    .where(col("source") == "TOP_SITES")
	    .where(col("date") >= july_fifteen)
	    .where(col("date") < aug_fifteen)
	    .groupBy("client_id").count()
	    .select("client_id", (col("count") / days_in_window).alias("num_clicks"))
	)


	# In[84]:

	topsites_stats = topsites_clickthrough.describe()
	topsites_stats.show()

	# describe() labels each statistic in its "summary" column. Collect once and
	# look the values up by label instead of relying on row position.
	topsites_summary = {row["summary"]: row["num_clicks"] for row in topsites_stats.collect()}
	stddev_topsites = topsites_summary["stddev"]
	mean_topsites_count = topsites_summary["mean"]


	# In[85]:

	diff_in_means = []
	sample_sizes = []

	# Loop-invariant inputs, converted once.
	topsites_mean_value = float(mean_topsites_count)
	topsites_stddev_value = float(stddev_topsites)
	power_solver = smp.TTestIndPower()

	# Sweep relative differences from 5.0% to 20.0% inclusive in 0.1% steps.
	# Integer steps keep the sweep inside its stated bounds and avoid the float
	# drift of repeatedly adding 0.001.
	for tenths_of_percent in range(50, 201):
	    percent_diff = tenths_of_percent / 1000.0
	    # Standardized effect size: absolute difference in means over the stddev.
	    effect_size = (percent_diff * topsites_mean_value) / topsites_stddev_value
	    # `nobs1` is left unset, so solve_power solves for the per-group sample size.
	    sample_size = power_solver.solve_power(effect_size, power=0.95, alpha=0.05, alternative='two-sided')

	    diff_in_means.append("%.1f%%" % (tenths_of_percent / 10.0))
	    sample_sizes.append(sample_size)


	# In[86]:

	# Required sample size as a function of the detectable relative difference.
	data = [go.Scatter(x=diff_in_means, y=sample_sizes)]

	layout = dict(
	    title='Topsite Clicks Power Analysis - Alpha = 0.05. Power = 0.95',
	    xaxis=dict(title='Percent Difference in Means (Topsite Clicks Per User Per Day)'),
	    yaxis=dict(title='Sample Size (Number of Users)'),
	)

	fig = dict(data=data, layout=layout)
	py.iplot(fig)


	# #### Topsite Clicks Plot: https://plot.ly/~emtwo/99/

	# ## Conclusion:
	#
	# #### Need ~7360 users per arm to observe a 15% difference (~0.105 topsite clicks per user per day).