johnmyleswhite/sequential.jl

## sequential.jl
# Ground rules:
#
# (1) Must collect at least 10 observations
# (2) After that, keep collecting observations until
#     we satisfy condition: s / sqrt(n) < tolerance
# (3) Then output a tuple of s and n.

using Distributions

function run_simulation()
    # Run enough simulations to have a reasonable sense of joint P(s, n)
    n_sims = 100_000

    # Per-simulation estimates of standard deviation.
    ss = Array(Float64, n_sims)

    # Per-simulation estimates of sample size required.
    ns = Array(Int, n_sims)

    # Each individual observation is a draw from this distribution.
    dgp = Normal(10, 1)

    # We stop when the SEM is smaller than this tolerance.
    tolerance = 0.10

    # Or we stop when we collect our full budget of samples.
    max_n = 1_000

    # Monte Carlo loop
    for sim in 1:n_sims
        # Collect initial 10 observations.
        x = rand(dgp, 10)

        # Estimate initial parameters.
        s = std(x)
        n = 10
        sem = s / sqrt(n)

        # Keep going until we hit tolerance or run out of budget.
        while sem > tolerance && n < max_n
            # Add one more observation.
            push!(x, rand(dgp))

            # Update parameter estimates.
            s = std(x)
            n += 1
            sem = s / sqrt(n)
        end

        # Store final outcome as a single draw from joint distribution.
        ss[sim] = s
        ns[sim] = n
    end

    return ss, ns
end

ss, ns = run_simulation()

# Write data to disk for later analysis in R.
writecsv("sequential.csv", hcat(ss, ns))

## viz.R
library("ggplot2")
library("dplyr")

tuples <- read.csv("sequential.csv", header = FALSE)

names(tuples) <- c("s", "n")

ggplot(tuples, aes(x = n, y = s)) +
    geom_point()

biases <- tuples %>%
    group_by(n) %>%
    summarize(bias = mean(s - 1))

ggplot(biases, aes(x = n, y = bias)) +
    geom_line()

tuples %>%
    summarize(
        bias = mean(s - 1)
    )

t.test(tuples$s - 1)
	# Ground rules:
	#
	# (1) Must collect at least 10 observations
	# (2) After that, keep collecting observations until
	# we satisfy condition: s / sqrt(n) < tolerance
	# (3) Then output a tuple of s and n.

	using Distributions

	function run_simulation()
	# Run enough simulations to have a reasonable sense of joint P(s, n)
	n_sims = 100_000

	# Per-simulation estimates of standard deviation.
	ss = Array(Float64, n_sims)

	# Per-simulation estimates of sample size required.
	ns = Array(Int, n_sims)

	# Each individual observation is a draw from this distribution.
	dgp = Normal(10, 1)

	# We stop when the SEM is smaller than this tolerance.
	tolerance = 0.10

	# Or we stop when we collect our full budget of samples.
	max_n = 1_000

	# Monte Carlo loop
	for sim in 1:n_sims
	# Collect initial 10 observations.
	x = rand(dgp, 10)

	# Estimate initial parameters.
	s = std(x)
	n = 10
	sem = s / sqrt(n)

	# Keep going until we hit tolerance or run out of budget.
	while sem > tolerance && n < max_n
	# Add one more observation.
	push!(x, rand(dgp))

	# Update parameter estimates.
	s = std(x)
	n += 1
	sem = s / sqrt(n)
	end

	# Store final outcome as a single draw from joint distribution.
	ss[sim] = s
	ns[sim] = n
	end

	return ss, ns
	end

	ss, ns = run_simulation()

	# Write data to disk for later analysis in R.
	writecsv("sequential.csv", hcat(ss, ns))
	library("ggplot2")
	library("dplyr")

	tuples <- read.csv("sequential.csv", header = FALSE)

	names(tuples) <- c("s", "n")

	ggplot(tuples, aes(x = n, y = s)) +
	geom_point()

	biases <- tuples %>%
	group_by(n) %>%
	summarize(bias = mean(s - 1))

	ggplot(biases, aes(x = n, y = bias)) +
	geom_line()

	tuples %>%
	summarize(
	bias = mean(s - 1)
	)

	t.test(tuples$s - 1)