Skip to content

Instantly share code, notes, and snippets.

@aychen5
Created January 30, 2021 12:30
Show Gist options
  • Save aychen5/0e77b99246ee59a65e8de24f7c46a1b4 to your computer and use it in GitHub Desktop.
Save aychen5/0e77b99246ee59a65e8de24f7c46a1b4 to your computer and use it in GitHub Desktop.
using Distributed
numCore = 4;
addprocs(numCore - 1);
#rmprocs(workers());
#%%
# PARAMETERS
# alpha: Pr(false positive)
# starting sample size: initial HITs distributed
# effect size:
#%%
@everywhere begin
using Random, StatsBase, Distributions, KernelDensity
using RCall
using GLM
using DataFrames, CategoricalArrays
using ProgressMeter, Distributed, FLoops
end
@everywhere begin
Random.seed!(2021)
"""
Perform random assignment (individual or block) given treated and untreated potential outcomes.
Outputs the pvalue from a hypothesis test via OLS.
"""
function randomization_scheme(Y0_sample, Y1_sample, sample_size, randomization_type::String, n_blocks = nothing)
#D = rand([0,1], sample_size)
if randomization_type == "individual"
# Treatment assignment vector
D = rand([0,1], sample_size)
elseif randomization_type == "block"
B_size = convert(Int64, round(sample_size ./ n_blocks, digits=0))
B_id = collect(1:n_blocks)
#random assignment within blocks
D = vcat([rand([0,1], B_size) for i in 1:length(B_id)]...)
#rounding error shenanigans
D = D[1:length(Y0_sample)]
end
Y = (Y0_sample .* (1 .- D)) .+ (Y1_sample .* D)
# test
if randomization_type == "individual"
df = DataFrame(D = D, Y = Y)
ols_result = lm(@formula(Y ~ D), df)
elseif randomization_type == "block"
df = DataFrame(D = D, Y = Y, B_j = repeat(B_id, inner = B_size)[1:length(Y0_sample)])
ols_result = lm(@formula(Y ~ D + B_j), df)
end
pval = coeftable(ols_result).cols[4][2]
return pval
end
"""
Calculate power based on how many initial HITs we start with and a given effect size.
Hard-coded parameters for reducing the sample size:
- in the U.S. (already accounted for in MTurk)
- pass attention check
- good open-ended response
- has a Twitter account
- frequent Twitter user
"""
function power_function(HIT_size, effect_size, n_sims, alpha, randomization_type, n_blocks = 2)
# Declare population
# population is all mturk users
N = 10^4
# Population potential outcomes
Y0 = rand(Normal(0, 1), N)
#Y1 = Y0 .+ rand(Normal(.2, 1), N)
# sampling process in MTurk
n_1 = HIT_size .* 0.9
n_2 = n_1 .* 0.65
n_turk_twitter = convert(Int64, round(n_2 .* 0.55, digits=0))
Y0_sample = sample(Y0, n_turk_twitter, replace = false)
if randomization_type == "individual"# sampling from the mturk twitter population
Y1_sample = Y0_sample .+ effect_size
#supposing blocks
elseif randomization_type == "block"
# different magnitude of effect in each block
effect_size_j = rand(truncated(Normal(effect_size, .1), 0, Inf), n_blocks)
B_size = convert(Int64, round(n_turk_twitter ./ n_blocks, digits=0))
effect_size_j = repeat(effect_size_j, inner = B_size)
if length(effect_size_j) < length(Y0_sample)
Y0_sample = Y0_sample[1:length(effect_size_j)]
else
effect_size_j = effect_size_j[1:length(Y0_sample)]
end
Y1_sample = Y0_sample .+ effect_size_j
end
# generate pvalues
if randomization_type == "individual"
sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "individual") for i in 1:n_sims]
elseif randomization_type == "block"
sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "block", 2) for i in 1:n_sims]
end
# the proportion of pvalues less than alpha decision rule
percent_detected = length(sim_pvalues[findall(sim_pvalues .< alpha)]) ./ length(sim_pvalues)
return [percent_detected, n_turk_twitter]
end
"""
CloudResearch/Turk Prime:
- microbatching (run HITs sequentially) OR hyperbatching (runs all HITs simultaneously)
- anonymized worker id
- enhanced privacy
"""
function mturk_cost_function(sample_size)
# cost parameters
#length_of_survey = ?
pay_per_participant = 1
amazon_commission = .2
# starting 2021, academic accounts also charged 10% of worker's compensation
cloud_research_fee = .10 .* pay_per_participant
pro_features_fee = .25 .* pay_per_participant
total_cost = (sample_size .* pay_per_participant) .+ (sample_size .* (pay_per_participant .* amazon_commission)) .+ (sample_size .* (pay_per_participant .* pro_features_fee)) .+ (sample_size .* (pay_per_participant .* cloud_research_fee))
return total_cost
end
"""
Take power and calculate the MDES.
Assume proportion treated and untreated are equal.
Two-tailed test.
"""
function mdes_function(effect_size, alpha, power, sample_size, randomization_type,
n_blocks = nothing, R_squared = nothing)
sd_y = 1
p_0 = .5
p_1 = .5
if randomization_type == "individual"
t_dist = TDist(sample_size .- 2)
elseif randomization_type == "block"
t_dist = TDist(sample_size .- n_blocks .- 1)
end
t_alpha = quantile(t_dist, 1 .- (alpha ./ 2))
t_pwr = quantile(t_dist, power)
multiplier = t_alpha .+ t_pwr
if randomization_type == "individual"
mdes = multiplier .* (sd_y / sqrt(sample_size .* (p_0 .* p_1)))
elseif randomization_type == "block"
mdes = multiplier .* ((1 .- R_squared) ./ sqrt(sample_size .* (p_0 .* p_1)))
end
return mdes
end
"""
For each sample size, repeat process X times and take average power.
Output starting sample size and reduced sample.
"""
function sim_power_function(HIT_size, effect_size, randomization_type, quantity = "power", mdes_power = nothing)
if quantity == "power"
average_power = mean([power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[1] for i in 1:100])
filtered_sample_size = power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[2]
cost = mturk_cost_function(HIT_size)
out = [average_power, HIT_size, filtered_sample_size, effect_size, cost]
elseif quantity == "mdes"
mdes = mdes_function(effect_size, .05, mdes_power, filtered_sample_size, "block")
cost = mturk_cost_function(HIT_size)
out = [mdes, HIT_size, filtered_sample_size, mdes_power, effect_size, cost]
end
return out
end
end
############## POWER #####################################
alpha = 0.05
sample_sizes = collect( range(100, stop = 5000, step = 100) )
effect_sizes = collect( [0.1, 0.25, 0.65] )
power_sims_ind = []
for i in 1:length(effect_sizes)
push!(power_sims_ind,
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "individual"), sample_sizes))
end
power_sims_block = []
for i in 1:length(effect_sizes)
push!(power_sims_block,
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
end
############# MDES ######################################
mdes_power = collect( range(0, stop = 1, step = .1) )
mdes_sims_block = []
for i in 1:length(mdes_power)
push!(mdes_sims_block,
@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
end
############# STACK SIMS ######################################
function power_in_df(data)
df = DataFrame(data)
df.variable_type = ["power", "initial_sample_size", "reduced_sample_size", "effect", "cost"]
#reshape
out = unstack(stack(df, 1:length(sample_sizes)), :variable_type, :value)
# to categorical data
out = categorical(out, :effect)
return out
end
all_power_sims = vcat(power_in_df(power_sims_ind[1]),
power_in_df(power_sims_ind[2]),
power_in_df(power_sims_ind[3]))
all_power_sims_block = vcat(power_in_df(power_sims_block[1]),
power_in_df(power_sims_block[2]),
power_in_df(power_sims_block[3]))
print(all_power_sims)
############# PLOT ######################################
@rlibrary ggplot2
ggplot(all_power_sims) +
geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
theme_bw()
ggplot(all_power_sims_block) +
geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
theme_bw()
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment