aychen5/mturk_power_sim.jl

## mturk_power_sim.jl
using Distributed
numCore = 4;
addprocs(numCore - 1);
#rmprocs(workers());

#%%
# PARAMETERS
# alpha: Pr(false positive)
# starting sample size: initial HITs distributed
# effect size:
#%%


@everywhere begin
    using Random, StatsBase, Distributions, KernelDensity
    using RCall
    using GLM
    using DataFrames, CategoricalArrays
    using ProgressMeter, Distributed, FLoops
end

@everywhere begin
    Random.seed!(2021)

    """
    Perform random assignment (individual or block) given treated and untreated potential outcomes.
    Outputs the pvalue from a hypothesis test via OLS.
    """
    function randomization_scheme(Y0_sample, Y1_sample, sample_size, randomization_type::String, n_blocks = nothing)
        #D = rand([0,1], sample_size)
        if randomization_type == "individual"
            # Treatment assignment vector
            D = rand([0,1], sample_size)
        elseif randomization_type == "block"
            B_size = convert(Int64, round(sample_size ./ n_blocks, digits=0))
            B_id = collect(1:n_blocks)
            #random assignment within blocks
            D = vcat([rand([0,1], B_size) for i in 1:length(B_id)]...)
            #rounding error shenanigans
            D = D[1:length(Y0_sample)]
        end

        Y = (Y0_sample .* (1 .- D)) .+ (Y1_sample .* D)

        # test
        if randomization_type == "individual"
            df = DataFrame(D = D, Y = Y)
            ols_result = lm(@formula(Y ~ D), df)
        elseif randomization_type == "block"
            df = DataFrame(D = D, Y = Y, B_j = repeat(B_id, inner = B_size)[1:length(Y0_sample)])
            ols_result = lm(@formula(Y ~ D + B_j), df)
        end
        pval = coeftable(ols_result).cols[4][2]
        return pval
    end

    """
    Calculate power based on how many initial HITs we start with and a given effect size.
    Hard-coded parameters for reducing the sample size:
    - in the U.S. (already accounted for in MTurk)
    - pass attention check
    - good open-ended response
    - has a Twitter account
    - frequent Twitter user
    """
    function power_function(HIT_size, effect_size, n_sims, alpha, randomization_type, n_blocks = 2)
        # Declare population
        # population is all mturk users
        N = 10^4

        # Population potential outcomes
        Y0 = rand(Normal(0, 1), N)
        #Y1 = Y0 .+ rand(Normal(.2, 1), N)

        # sampling process in MTurk
        n_1 = HIT_size .* 0.9
        n_2 = n_1 .* 0.65
        n_turk_twitter = convert(Int64, round(n_2 .* 0.55, digits=0))

        Y0_sample = sample(Y0, n_turk_twitter, replace = false)

        if randomization_type == "individual"# sampling from the mturk twitter population
            Y1_sample = Y0_sample .+ effect_size
        #supposing blocks
        elseif randomization_type == "block"
            # different magnitude of effect in each block
            effect_size_j = rand(truncated(Normal(effect_size, .1), 0, Inf), n_blocks)
            B_size =  convert(Int64, round(n_turk_twitter ./ n_blocks, digits=0))
            effect_size_j = repeat(effect_size_j, inner = B_size)
            if length(effect_size_j) < length(Y0_sample)
                Y0_sample = Y0_sample[1:length(effect_size_j)]
            else
                effect_size_j = effect_size_j[1:length(Y0_sample)]
            end
            Y1_sample = Y0_sample .+ effect_size_j
        end

        # generate pvalues
        if randomization_type == "individual"
            sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "individual") for i in 1:n_sims]
        elseif randomization_type == "block"
            sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "block", 2) for i in 1:n_sims]
        end
        # the proportion of pvalues less than alpha decision rule
        percent_detected = length(sim_pvalues[findall(sim_pvalues .< alpha)]) ./ length(sim_pvalues)

        return [percent_detected, n_turk_twitter]
    end

    """
    CloudResearch/Turk Prime:
    - microbatching (run HITs sequentially) OR hyperbatching (runs all HITs simultaneously)
    - anonymized worker id
    - enhanced privacy
    """
    function mturk_cost_function(sample_size)
        # cost parameters
        #length_of_survey = ?
        pay_per_participant = 1
        amazon_commission = .2
        # starting 2021, academic accounts also charged 10% of worker's compensation
        cloud_research_fee = .10 .* pay_per_participant
        pro_features_fee = .25 .* pay_per_participant

        total_cost = (sample_size .* pay_per_participant) .+ (sample_size .* (pay_per_participant .* amazon_commission)) .+ (sample_size .* (pay_per_participant .* pro_features_fee)) .+ (sample_size .* (pay_per_participant .* cloud_research_fee))
        return total_cost
    end

    """
    Take power and calculate the MDES.
    Assume proportion treated and untreated are equal.
    Two-tailed test.
    """
    function mdes_function(effect_size, alpha, power, sample_size, randomization_type,
                           n_blocks = nothing, R_squared = nothing)
        sd_y = 1
        p_0 = .5
        p_1 = .5
        if randomization_type == "individual"
            t_dist = TDist(sample_size .- 2)
        elseif randomization_type == "block"
            t_dist = TDist(sample_size .- n_blocks .- 1)
        end

        t_alpha = quantile(t_dist, 1 .- (alpha ./ 2))
        t_pwr = quantile(t_dist, power)
        multiplier = t_alpha .+ t_pwr

        if randomization_type == "individual"
            mdes = multiplier .* (sd_y / sqrt(sample_size .* (p_0 .* p_1)))
        elseif randomization_type == "block"
            mdes = multiplier .* ((1 .- R_squared) ./ sqrt(sample_size .* (p_0 .* p_1)))
        end
        return mdes
    end

    """
    For each sample size, repeat process X times and take average power.
    Output starting sample size and reduced sample.
    """
    function sim_power_function(HIT_size, effect_size, randomization_type, quantity = "power", mdes_power = nothing)
        if quantity == "power"
            average_power = mean([power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[1] for i in 1:100])
            filtered_sample_size = power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[2]
            cost = mturk_cost_function(HIT_size)
            out = [average_power, HIT_size, filtered_sample_size, effect_size, cost]
        elseif quantity == "mdes"
            mdes = mdes_function(effect_size, .05, mdes_power, filtered_sample_size, "block")
            cost = mturk_cost_function(HIT_size)
            out = [mdes, HIT_size, filtered_sample_size, mdes_power, effect_size, cost]
        end
        return out
    end
end

############## POWER #####################################
alpha = 0.05
sample_sizes = collect( range(100, stop = 5000, step = 100) )
effect_sizes = collect( [0.1, 0.25, 0.65] )

power_sims_ind = []
for i in 1:length(effect_sizes)
    push!(power_sims_ind,
          @showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "individual"), sample_sizes))
end

power_sims_block = []
for i in 1:length(effect_sizes)
    push!(power_sims_block,
          @showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
end
############# MDES ######################################
mdes_power = collect( range(0, stop = 1, step = .1) )
mdes_sims_block = []
for i in 1:length(mdes_power)
    push!(mdes_sims_block,
          @showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
end

############# STACK SIMS ######################################
function power_in_df(data)
    df = DataFrame(data)
    df.variable_type = ["power", "initial_sample_size", "reduced_sample_size", "effect", "cost"]
    #reshape
    out = unstack(stack(df, 1:length(sample_sizes)), :variable_type, :value)
    # to categorical data
    out = categorical(out, :effect)
    return out
end
all_power_sims = vcat(power_in_df(power_sims_ind[1]),
                      power_in_df(power_sims_ind[2]),
                      power_in_df(power_sims_ind[3]))

all_power_sims_block = vcat(power_in_df(power_sims_block[1]),
                      power_in_df(power_sims_block[2]),
                      power_in_df(power_sims_block[3]))
print(all_power_sims)

############# PLOT ######################################
@rlibrary ggplot2
ggplot(all_power_sims) +
    geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
    theme_bw()

ggplot(all_power_sims_block) +
    geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
    theme_bw()


#
	using Distributed
	numCore = 4;
	addprocs(numCore - 1);
	#rmprocs(workers());

	#%%
	# PARAMETERS
	# alpha: Pr(false positive)
	# starting sample size: initial HITs distributed
	# effect size:
	#%%


	@everywhere begin
	using Random, StatsBase, Distributions, KernelDensity
	using RCall
	using GLM
	using DataFrames, CategoricalArrays
	using ProgressMeter, Distributed, FLoops
	end

	@everywhere begin
	Random.seed!(2021)

	"""
	Perform random assignment (individual or block) given treated and untreated potential outcomes.
	Outputs the pvalue from a hypothesis test via OLS.
	"""
	function randomization_scheme(Y0_sample, Y1_sample, sample_size, randomization_type::String, n_blocks = nothing)
	#D = rand([0,1], sample_size)
	if randomization_type == "individual"
	# Treatment assignment vector
	D = rand([0,1], sample_size)
	elseif randomization_type == "block"
	B_size = convert(Int64, round(sample_size ./ n_blocks, digits=0))
	B_id = collect(1:n_blocks)
	#random assignment within blocks
	D = vcat([rand([0,1], B_size) for i in 1:length(B_id)]...)
	#rounding error shenanigans
	D = D[1:length(Y0_sample)]
	end

	Y = (Y0_sample .* (1 .- D)) .+ (Y1_sample .* D)

	# test
	if randomization_type == "individual"
	df = DataFrame(D = D, Y = Y)
	ols_result = lm(@formula(Y ~ D), df)
	elseif randomization_type == "block"
	df = DataFrame(D = D, Y = Y, B_j = repeat(B_id, inner = B_size)[1:length(Y0_sample)])
	ols_result = lm(@formula(Y ~ D + B_j), df)
	end
	pval = coeftable(ols_result).cols[4][2]
	return pval
	end

	"""
	Calculate power based on how many initial HITs we start with and a given effect size.
	Hard-coded parameters for reducing the sample size:
	- in the U.S. (already accounted for in MTurk)
	- pass attention check
	- good open-ended response
	- has a Twitter account
	- frequent Twitter user
	"""
	function power_function(HIT_size, effect_size, n_sims, alpha, randomization_type, n_blocks = 2)
	# Declare population
	# population is all mturk users
	N = 10^4

	# Population potential outcomes
	Y0 = rand(Normal(0, 1), N)
	#Y1 = Y0 .+ rand(Normal(.2, 1), N)

	# sampling process in MTurk
	n_1 = HIT_size .* 0.9
	n_2 = n_1 .* 0.65
	n_turk_twitter = convert(Int64, round(n_2 .* 0.55, digits=0))

	Y0_sample = sample(Y0, n_turk_twitter, replace = false)

	if randomization_type == "individual"# sampling from the mturk twitter population
	Y1_sample = Y0_sample .+ effect_size
	#supposing blocks
	elseif randomization_type == "block"
	# different magnitude of effect in each block
	effect_size_j = rand(truncated(Normal(effect_size, .1), 0, Inf), n_blocks)
	B_size = convert(Int64, round(n_turk_twitter ./ n_blocks, digits=0))
	effect_size_j = repeat(effect_size_j, inner = B_size)
	if length(effect_size_j) < length(Y0_sample)
	Y0_sample = Y0_sample[1:length(effect_size_j)]
	else
	effect_size_j = effect_size_j[1:length(Y0_sample)]
	end
	Y1_sample = Y0_sample .+ effect_size_j
	end

	# generate pvalues
	if randomization_type == "individual"
	sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "individual") for i in 1:n_sims]
	elseif randomization_type == "block"
	sim_pvalues = [randomization_scheme(Y0_sample, Y1_sample, n_turk_twitter, "block", 2) for i in 1:n_sims]
	end
	# the proportion of pvalues less than alpha decision rule
	percent_detected = length(sim_pvalues[findall(sim_pvalues .< alpha)]) ./ length(sim_pvalues)

	return [percent_detected, n_turk_twitter]
	end

	"""
	CloudResearch/Turk Prime:
	- microbatching (run HITs sequentially) OR hyperbatching (runs all HITs simultaneously)
	- anonymized worker id
	- enhanced privacy
	"""
	function mturk_cost_function(sample_size)
	# cost parameters
	#length_of_survey = ?
	pay_per_participant = 1
	amazon_commission = .2
	# starting 2021, academic accounts also charged 10% of worker's compensation
	cloud_research_fee = .10 .* pay_per_participant
	pro_features_fee = .25 .* pay_per_participant

	total_cost = (sample_size .* pay_per_participant) .+ (sample_size .* (pay_per_participant .* amazon_commission)) .+ (sample_size .* (pay_per_participant .* pro_features_fee)) .+ (sample_size .* (pay_per_participant .* cloud_research_fee))
	return total_cost
	end

	"""
	Take power and calculate the MDES.
	Assume proportion treated and untreated are equal.
	Two-tailed test.
	"""
	function mdes_function(effect_size, alpha, power, sample_size, randomization_type,
	n_blocks = nothing, R_squared = nothing)
	sd_y = 1
	p_0 = .5
	p_1 = .5
	if randomization_type == "individual"
	t_dist = TDist(sample_size .- 2)
	elseif randomization_type == "block"
	t_dist = TDist(sample_size .- n_blocks .- 1)
	end

	t_alpha = quantile(t_dist, 1 .- (alpha ./ 2))
	t_pwr = quantile(t_dist, power)
	multiplier = t_alpha .+ t_pwr

	if randomization_type == "individual"
	mdes = multiplier .* (sd_y / sqrt(sample_size .* (p_0 .* p_1)))
	elseif randomization_type == "block"
	mdes = multiplier .* ((1 .- R_squared) ./ sqrt(sample_size .* (p_0 .* p_1)))
	end
	return mdes
	end

	"""
	For each sample size, repeat process X times and take average power.
	Output starting sample size and reduced sample.
	"""
	function sim_power_function(HIT_size, effect_size, randomization_type, quantity = "power", mdes_power = nothing)
	if quantity == "power"
	average_power = mean([power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[1] for i in 1:100])
	filtered_sample_size = power_function(HIT_size, effect_size, 1000, 0.05, randomization_type)[2]
	cost = mturk_cost_function(HIT_size)
	out = [average_power, HIT_size, filtered_sample_size, effect_size, cost]
	elseif quantity == "mdes"
	mdes = mdes_function(effect_size, .05, mdes_power, filtered_sample_size, "block")
	cost = mturk_cost_function(HIT_size)
	out = [mdes, HIT_size, filtered_sample_size, mdes_power, effect_size, cost]
	end
	return out
	end
	end

	############## POWER #####################################
	alpha = 0.05
	sample_sizes = collect( range(100, stop = 5000, step = 100) )
	effect_sizes = collect( [0.1, 0.25, 0.65] )

	power_sims_ind = []
	for i in 1:length(effect_sizes)
	push!(power_sims_ind,
	@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "individual"), sample_sizes))
	end

	power_sims_block = []
	for i in 1:length(effect_sizes)
	push!(power_sims_block,
	@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
	end
	############# MDES ######################################
	mdes_power = collect( range(0, stop = 1, step = .1) )
	mdes_sims_block = []
	for i in 1:length(mdes_power)
	push!(mdes_sims_block,
	@showprogress 1 "Pls be patient..." pmap(p -> sim_power_function(p, effect_sizes[i], "block"), sample_sizes))
	end

	############# STACK SIMS ######################################
	function power_in_df(data)
	df = DataFrame(data)
	df.variable_type = ["power", "initial_sample_size", "reduced_sample_size", "effect", "cost"]
	#reshape
	out = unstack(stack(df, 1:length(sample_sizes)), :variable_type, :value)
	# to categorical data
	out = categorical(out, :effect)
	return out
	end
	all_power_sims = vcat(power_in_df(power_sims_ind[1]),
	power_in_df(power_sims_ind[2]),
	power_in_df(power_sims_ind[3]))

	all_power_sims_block = vcat(power_in_df(power_sims_block[1]),
	power_in_df(power_sims_block[2]),
	power_in_df(power_sims_block[3]))
	print(all_power_sims)

	############# PLOT ######################################
	@rlibrary ggplot2
	ggplot(all_power_sims) +
	geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
	theme_bw()

	ggplot(all_power_sims_block) +
	geom_line(aes(x = :reduced_sample_size, y = :power, color = :effect, group = :effect)) +
	theme_bw()


	#