Created
January 30, 2021 12:30
-
-
Save aychen5/0e77b99246ee59a65e8de24f7c46a1b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Distributed | |
numCore = 4; | |
addprocs(numCore - 1); | |
#rmprocs(workers()); | |
#%% | |
# PARAMETERS | |
# alpha: Pr(false positive) | |
# starting sample size: initial HITs distributed | |
# effect size: | |
#%% | |
@everywhere begin | |
using Random, StatsBase, Distributions, KernelDensity | |
using RCall | |
using GLM | |
using DataFrames, CategoricalArrays | |
using ProgressMeter, Distributed, FLoops | |
end | |
@everywhere begin | |
Random.seed!(2021) | |
""" | |
Perform random assignment (individual or block) given treated and untreated potential outcomes. | |
Outputs the pvalue from a hypothesis test via OLS. | |
""" | |
function randomization_scheme(Y0_sample, Y1_sample, sample_size, randomization_type::String, n_blocks = nothing) | |
#D = rand([0,1], sample_size) | |
if randomization_type == "individual" | |
# Treatment assignment vector | |
D = rand([0,1], sample_size) | |
elseif randomization_type == "block" | |
B_size = convert(Int64, round(sample_size ./ n_blocks, digits=0)) | |
B_id = collect(1:n_blocks) | |
#random assignment within blocks | |
D = vcat([rand([0,1], B_size) for i in 1:length(B_id)]...) | |
#rounding error shenanigans | |
D = D[1:length(Y0_sample)] | |
end | |
Y = (Y0_sample .* (1 .- D)) .+ (Y1_sample .* D) | |
# test | |
if randomization_type == "individual" | |
df = DataFrame(D = D, Y = Y) | |
ols_result = lm(@formula(Y ~ D), df) | |
elseif randomization_type == "block" | |
df = DataFrame(D = D, Y = Y, B_j = repeat(B_id, inner = B_size)[1:length(Y0_sample)]) | |
ols_result = lm(@formula(Y ~ D + B_j), df) | |
end | |
pval = coeftable(ols_result).cols[4][2] | |
return pval | |
end | |
""" | |
Calculate power based on how many initial HITs we start with and a given effect size. | |
Hard-coded parameters for reducing the sample size: | |
- in the U.S. (already accounted for in MTurk) | |
- pass attention check | |
- good open-ended response | |
- has a Twitter account | |
- frequent Twitter user | |
""" | |
function power_function(HIT_size, effect_size, n_sims, alpha, randomization_type, n_blocks = 2) | |
# Declare population | |
# population is all mturk users | |
N = 10^4 | |
# Population potential outcomes | |
Y0 = rand(Normal(0, 1), N) | |
#Y1 = Y0 .+ rand(Normal(.2, 1), N) | |
# sampling process in MTurk | |
n_1 = HIT_size .* 0.9 | |
n_2 = n_1 .* 0.65 | |
n_turk_twitter = convert(Int64, round(n_2 .* 0.55, digits=0)) | |
Y0_sample = sample(Y0, n_turk_twitter, replace = false) | |
if randomization_type == "individual"# sampling from the mturk twitter population | |
Y1_sample = Y0_sample .+ effect_size | |
#supposing blocks | |
elseif randomization_type == "block" | |
# different magnitude of effect in each block | |
effect_size_j = rand(truncated(Normal(effect_size, .1), 0, Inf), n_blocks) | |
B_size = convert(Int64, round(n_turk_twitter ./ n_blocks, digits=0)) | |
effect_size_j = repeat(effect_size_j, inner = B_size) | |
if length(effect_size_j) < length(Y0_sample) | |
Y0_sample = Y0_sample[1:length(effect_size_j)] | |
else | |
effect_size_j = effect_size_j[1:length(Y0_sample)] | |
end | |
Y1_sample = Y0_sample .+ effect_size_j | |
end | |
# generate pvalues | |
if randomization_type == "individual" | |
sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "individual") for i in 1:n_sims] | |
elseif randomization_type == "block" | |
sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "block", 2) for i in 1:n_sims] | |
end | |
# the proportion of pvalues less than alpha decision rule | |
percent_detected = length(sim_pvalues[findall(sim_pvalues .< alpha)]) ./ length(sim_pvalues) | |
return [percent_detected, n_turk_twitter] | |
end | |
""" | |
CloudResearch/Turk Prime: | |
- microbatching (run HITs sequentially) OR hyperbatching (runs all HITs simultaneously) | |
- anonymized worker id | |
- enhanced privacy | |
""" | |
function mturk_cost_function(sample_size) | |
# cost parameters | |
#length_of_survey = ? | |
pay_per_participant = 1 | |
amazon_commission = .2 | |
# starting 2021, academic accounts also charged 10% of worker's compensation | |
cloud_research_fee = .10 .* pay_per_participant | |
pro_features_fee = .25 .* pay_per_participant | |
total_cost = (sample_size .* pay_per_participant) .+ (sample_size .* (pay_per_participant .* amazon_commission)) .+ (sample_size .* (pay_per_participant .* pro_features_fee)) .+ (sample_size .* (pay_per_participant .* cloud_research_fee)) | |
return total_cost | |
end | |
""" | |
Take power and calculate the MDES. | |
Assume proportion treated and untreated are equal. | |
Two-tailed test. | |
""" | |
function mdes_function(effect_size, alpha, power, sample_size, randomization_type, | |
n_blocks = nothing, R_squared = nothing) | |
sd_y = 1 | |
p_0 = .5 | |
p_1 = .5 | |
if randomization_type == "individual" | |
t_dist = TDist(sample_size .- 2) | |
elseif randomization_type == "block" | |
t_dist = TDist(sample_size .- n_blocks .- 1) | |
end | |
t_alpha = quantile(t_dist, 1 .- (alpha ./ 2)) | |
t_pwr = quantile(t_dist, power) | |
multiplier = t_alpha .+ t_pwr | |
if randomization_type == "individual" | |
mdes = multiplier .* (sd_y / sqrt(sample_size .* (p_0 .* p_1))) | |
elseif randomization_type == "block" | |
mdes = multiplier .* ((1 .- R_squared) ./ sqrt(sample_size .* (p_0 .* p_1))) | |
end | |
return mdes | |
end | |
""" | |
For each sample size, repeat process X times and take average power. | |
Output starting sample size and reduced sample. | |
""" | |
function sim_power_function(HIT_size, effect_size, randomization_type, quantity = "power", mdes_power = nothing) | |
if quantity == "power" | |
average_power = mean([power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[1] for i in 1:100]) | |
filtered_sample_size = power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[2] | |
cost = mturk_cost_function(HIT_size) | |
out = [average_power, HIT_size, filtered_sample_size, effect_size, cost] | |
elseif quantity == "mdes" | |
mdes = mdes_function(effect_size, .05, mdes_power, filtered_sample_size, "block") | |
cost = mturk_cost_function(HIT_size) | |
out = [mdes, HIT_size, filtered_sample_size, mdes_power, effect_size, cost] | |
end | |
return out | |
end | |
end | |
############## POWER ##################################### | |
alpha = 0.05 | |
sample_sizes = collect( range(100, stop = 5000, step = 100) ) | |
effect_sizes = collect( [0.1, 0.25, 0.65] ) | |
power_sims_ind = [] | |
for i in 1:length(effect_sizes) | |
push!(power_sims_ind, | |
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "individual"), sample_sizes)) | |
end | |
power_sims_block = [] | |
for i in 1:length(effect_sizes) | |
push!(power_sims_block, | |
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes)) | |
end | |
############# MDES ###################################### | |
mdes_power = collect( range(0, stop = 1, step = .1) ) | |
mdes_sims_block = [] | |
for i in 1:length(mdes_power) | |
push!(mdes_sims_block, | |
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes)) | |
end | |
############# STACK SIMS ###################################### | |
function power_in_df(data) | |
df = DataFrame(data) | |
df.variable_type = ["power", "initial_sample_size", "reduced_sample_size", "effect", "cost"] | |
#reshape | |
out = unstack(stack(df, 1:length(sample_sizes)), :variable_type, :value) | |
# to categorical data | |
out = categorical(out, :effect) | |
return out | |
end | |
all_power_sims = vcat(power_in_df(power_sims_ind[1]), | |
power_in_df(power_sims_ind[2]), | |
power_in_df(power_sims_ind[3])) | |
all_power_sims_block = vcat(power_in_df(power_sims_block[1]), | |
power_in_df(power_sims_block[2]), | |
power_in_df(power_sims_block[3])) | |
print(all_power_sims) | |
############# PLOT ###################################### | |
@rlibrary ggplot2 | |
ggplot(all_power_sims) + | |
geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) + | |
theme_bw() | |
ggplot(all_power_sims_block) + | |
geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) + | |
theme_bw() | |
# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment