johnmyleswhite/small_sample_bootstrap.jl

## small_sample_bootstrap.jl
import Distributions: LogNormal, Normal, cdf, cquantile

function coverage(cis, truth)
    s = 0
    n = length(cis)
    for ci in cis
        s += ci[1] <= truth <= ci[2]
    end
    return s / n
end

function normal_ci(x)
    m = mean(x)
    sem = std(x) / sqrt(length(x))
    c_l = quantile(Normal(0, 1), 0.05 / 2)
    c_u = cquantile(Normal(0, 1), 0.05 / 2)
    return m + c_l * sem, m + c_u * sem
end

function clt_bootstrap_ci(x, n_replicates)
    replicates = Array(Float64, n_replicates)
    x_resampled = similar(x)
    n = length(x)
    for replicate in 1:n_replicates
        for i in 1:n
            j = rand(1:n)
            x_resampled[i] = x[j]
        end
        replicates[replicate] = mean(x_resampled)
    end
    m, s = mean(replicates), std(replicates)
    l = quantile(Normal(m, s), 0.05 / 2)
    u = cquantile(Normal(m, s), 0.05 / 2)
    return l, u
end

function percentile_bootstrap_ci(x, n_replicates)
    replicates = Array(Float64, n_replicates)
    x_resampled = similar(x)
    n = length(x)
    for replicate in 1:n_replicates
        for i in 1:n
            j = rand(1:n)
            x_resampled[i] = x[j]
        end
        replicates[replicate] = mean(x_resampled)
    end
    l = quantile(replicates, 0.05 / 2)
    u = quantile(replicates, 1 - 0.05 / 2)
    return l, u
end

function simulate()
    src = LogNormal(0, 1)
    ns = 2:30
    n_sims = 100_000
    means = Array(Float64, n_sims)
    normal_cis = Array(Tuple{Float64, Float64}, n_sims)
    bootstrap_1000_cis = Array(Tuple{Float64, Float64}, n_sims)
    bootstrap_10000_cis = Array(Tuple{Float64, Float64}, n_sims)
    bootstrap_clt_cis = Array(Tuple{Float64, Float64}, n_sims)
    io = open("coverage.tsv", "w")
    @printf(io, "n\tcoverage\tmethod\n")
    for n in ns
        x = rand(src, n)
        for sim in 1:n_sims
            rand!(src, x)
            means[sim] = mean(x)
            normal_cis[sim] = normal_ci(x)
            bootstrap_1000_cis[sim] = percentile_bootstrap_ci(x, 1_000)
            bootstrap_10000_cis[sim] = percentile_bootstrap_ci(x, 10_000)
            bootstrap_clt_cis[sim] = clt_bootstrap_ci(x, 100)
        end
        empirical_mean = mean(means)
        true_mean = mean(src)
        empirical_sem = std(means)
        true_sem = sqrt(var(src) / n)
        @printf(
            io,
            "%d\t%s\t%s\n",
            n,
            coverage(normal_cis, true_mean),
            "CLT Normal",
        )
        @printf(
            io,
            "%d\t%s\t%s\n",
            n,
            coverage(bootstrap_1000_cis, true_mean),
            "Percentile Bootstrap w/ 1,000 Replicates",
        )
        @printf(
            io,
            "%d\t%s\t%s\n",
            n,
            coverage(bootstrap_10000_cis, true_mean),
            "Percentile Bootstrap w/ 10,000 Replicates",
        )
        @printf(
            io,
            "%d\t%s\t%s\n",
            n,
            coverage(bootstrap_clt_cis, true_mean),
            "CLT Bootstrap w/ 100 Replicates",
        )
        println(n)
    end
    close(io)
end

simulate()

## visualize_results.R
library("ggplot2")
library("reshape2")

coverage <- read.csv("coverage.tsv", sep = "\t")

ggplot(coverage, aes(x = n, y = coverage, color = method)) +
    geom_hline(yintercept = 0.95, alpha = 0.25) +
    geom_line() +
    xlab("N") +
    ylab("Empirical Coverage Probability") +
    ggtitle("Anti-Conservative Coverage Probabilities for Small N") +
    ylim(0.4, 1) +
    theme_bw()
ggsave("coverage.png", height = 8, width = 12)
	import Distributions: LogNormal, Normal, cdf, cquantile

	function coverage(cis, truth)
	s = 0
	n = length(cis)
	for ci in cis
	s += ci[1] <= truth <= ci[2]
	end
	return s / n
	end

	function normal_ci(x)
	m = mean(x)
	sem = std(x) / sqrt(length(x))
	c_l = quantile(Normal(0, 1), 0.05 / 2)
	c_u = cquantile(Normal(0, 1), 0.05 / 2)
	return m + c_l * sem, m + c_u * sem
	end

	function clt_bootstrap_ci(x, n_replicates)
	replicates = Array(Float64, n_replicates)
	x_resampled = similar(x)
	n = length(x)
	for replicate in 1:n_replicates
	for i in 1:n
	j = rand(1:n)
	x_resampled[i] = x[j]
	end
	replicates[replicate] = mean(x_resampled)
	end
	m, s = mean(replicates), std(replicates)
	l = quantile(Normal(m, s), 0.05 / 2)
	u = cquantile(Normal(m, s), 0.05 / 2)
	return l, u
	end

	function percentile_bootstrap_ci(x, n_replicates)
	replicates = Array(Float64, n_replicates)
	x_resampled = similar(x)
	n = length(x)
	for replicate in 1:n_replicates
	for i in 1:n
	j = rand(1:n)
	x_resampled[i] = x[j]
	end
	replicates[replicate] = mean(x_resampled)
	end
	l = quantile(replicates, 0.05 / 2)
	u = quantile(replicates, 1 - 0.05 / 2)
	return l, u
	end

	function simulate()
	src = LogNormal(0, 1)
	ns = 2:30
	n_sims = 100_000
	means = Array(Float64, n_sims)
	normal_cis = Array(Tuple{Float64, Float64}, n_sims)
	bootstrap_1000_cis = Array(Tuple{Float64, Float64}, n_sims)
	bootstrap_10000_cis = Array(Tuple{Float64, Float64}, n_sims)
	bootstrap_clt_cis = Array(Tuple{Float64, Float64}, n_sims)
	io = open("coverage.tsv", "w")
	@printf(io, "n\tcoverage\tmethod\n")
	for n in ns
	x = rand(src, n)
	for sim in 1:n_sims
	rand!(src, x)
	means[sim] = mean(x)
	normal_cis[sim] = normal_ci(x)
	bootstrap_1000_cis[sim] = percentile_bootstrap_ci(x, 1_000)
	bootstrap_10000_cis[sim] = percentile_bootstrap_ci(x, 10_000)
	bootstrap_clt_cis[sim] = clt_bootstrap_ci(x, 100)
	end
	empirical_mean = mean(means)
	true_mean = mean(src)
	empirical_sem = std(means)
	true_sem = sqrt(var(src) / n)
	@printf(
	io,
	"%d\t%s\t%s\n",
	n,
	coverage(normal_cis, true_mean),
	"CLT Normal",
	)
	@printf(
	io,
	"%d\t%s\t%s\n",
	n,
	coverage(bootstrap_1000_cis, true_mean),
	"Percentile Bootstrap w/ 1,000 Replicates",
	)
	@printf(
	io,
	"%d\t%s\t%s\n",
	n,
	coverage(bootstrap_10000_cis, true_mean),
	"Percentile Bootstrap w/ 10,000 Replicates",
	)
	@printf(
	io,
	"%d\t%s\t%s\n",
	n,
	coverage(bootstrap_clt_cis, true_mean),
	"CLT Bootstrap w/ 100 Replicates",
	)
	println(n)
	end
	close(io)
	end

	simulate()
	library("ggplot2")
	library("reshape2")

	coverage <- read.csv("coverage.tsv", sep = "\t")

	ggplot(coverage, aes(x = n, y = coverage, color = method)) +
	geom_hline(yintercept = 0.95, alpha = 0.25) +
	geom_line() +
	xlab("N") +
	ylab("Empirical Coverage Probability") +
	ggtitle("Anti-Conservative Coverage Probabilities for Small N") +
	ylim(0.4, 1) +
	theme_bw()
	ggsave("coverage.png", height = 8, width = 12)