library('coda')
library('BayesFactor')
set.seed(1774) # Laplace easter egg :)
contains_zero <- function(interval) {
  return(interval[1] < 0 && interval[2] > 0)
}
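# illustrative checks (not part of the original gist):
# contains_zero(c(-0.1, 0.2))  # TRUE  -> interval straddles 0, consistent with H0
# contains_zero(c( 0.1, 0.2))  # FALSE -> interval excludes 0, taken as support for H1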
sim <- function(diff = 0, n = 500) {
  m <- 100
  sd <- 15
  x <- rnorm(n, mean = m, sd = sd)
  y <- rnorm(n, mean = m + diff, sd = sd)
  # delta will be negative (y has the larger mean)!
  freq_test <- t.test(x, y)
  bayes_test <- ttestBF(x, y)
  bf <- as.vector(bayes_test)
  samples <- posterior(bayes_test, iterations = 1000, progress = FALSE)
  # 95% HPD interval around delta, the standardized difference (4th row of the posterior)
  cred_interval <- as.vector(HPDinterval(samples)[4, ])
  return(list('bf' = bf, 'cred_interval' = cred_interval,
              'p' = freq_test$p.value, 'conf_interval' = freq_test$conf.int))
}
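# quick illustration (not in the original gist): one simulated data set with a true
# difference; kept commented out because extra rnorm calls would shift the RNG stream
# and change the seed-dependent results reported below
# example <- sim(diff = 10, n = 37)
# example$bf             # Bayes factor in favour of H1
# example$cred_interval  # 95% HPD interval for delta
# example$conf_interval  # 95% confidence interval for the raw difference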
run <- function(diff = 0, n = 500, times = 2000, bf_cutoff = 3, alpha = .05) {
  p_err <- 0
  bf_err <- 0
  cred_err <- 0
  conf_err <- 0
  freq_contr <- 0
  bayes_contr <- 0
  for (i in 1:times) {
    simulation <- sim(diff, n)
    p <- simulation$p
    bf <- simulation$bf
    cred0 <- contains_zero(simulation$cred_interval)
    conf0 <- contains_zero(simulation$conf_interval)
    # if there is no difference, check if a false positive occurred [bf >= cut-off; cred0 == FALSE]
    # if there is a difference, check if a false negative occurred [bf < cut-off; cred0 == TRUE]
    p_erred <- if (diff == 0) p <= alpha else p > alpha
    bf_erred <- if (diff == 0) bf >= bf_cutoff else bf < bf_cutoff
    cred_erred <- if (diff == 0) !cred0 else cred0
    conf_erred <- if (diff == 0) !conf0 else conf0
    # tally errors
    if (p_erred) p_err <- p_err + 1
    if (bf_erred) bf_err <- bf_err + 1
    if (cred_erred) cred_err <- cred_err + 1
    if (conf_erred) conf_err <- conf_err + 1
    # tally contradicting results within each framework
    if (p_erred != conf_erred) freq_contr <- freq_contr + 1
    if (bf_erred != cred_erred) bayes_contr <- bayes_contr + 1
  }
  res <- list('bf' = bf_err, 'cred' = cred_err, 'p' = p_err, 'conf' = conf_err,
              'bayes_contr' = bayes_contr, 'freq_contr' = freq_contr)
  return(lapply(res, function(e) e / times))  # convert counts to rates
}
# SETUP:
# the simulations compare the results of model comparison (Bayes factors / p-values)
# with those of interval estimation; a Bayes factor >= 3 is taken as support for H1,
# as suggested by Jeffreys; a p-value <= .05 is taken as *support* for H1, as
# suggested by Fisher; an interval that does not include 0 is taken as support for H1
#
# RESULTS:
# bayes: credibility intervals seem to be biased towards H1 (Bayes factors are not);
# that's because they don't control the type I error rate
# freq: p-values and confidence intervals reach equivalent conclusions (in this case, trivially so)
#
# n = 500, d = 2/3 ... in frequentist statistics this amounts to a power of effectively
# 100% (checked just below with power.t.test)
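# sketch (not in the original gist): verify the power claims with base R's power.t.test;
# delta = 10 and sd = 15 give d = 2/3, matching the simulation settings above
power.t.test(n = 500, delta = 10, sd = 15)$power  # effectively 1
power.t.test(n = 37,  delta = 10, sd = 15)$power  # ~ .80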
alpha <- run() # check false positives
# beta <- run(diff = 10)  # check false negatives; with n = 500, every method virtually always decides correctly
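# convenience (not in the original gist): flatten the result for easy reading
# unlist(alpha)  # named numeric vector of error / contradiction rates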
# N = 500
# no true difference (alpha errors):
# bf ~ .5%, cred ~ 5.1%, p ~ 5.15%, conf ~ 5.15%
# the paper at http://www.ejwagenmakers.com/submitted/AnotherStatisticalParadox.pdf argues
# that interval estimation biases in favour of H1; we see this for credibility intervals,
# but not for confidence intervals (the latter point - that p-values and confidence intervals
# are formally equivalent here - is mentioned in the paper)
# contradictions (BF / p-value differ from CIs):
# bayes ~ 4.6%, freq ~ 0%
# true difference: no beta errors (with samples this large, even p-values always get it right)
# n = 37, d = 2/3 ... frequentist power is ~80%; cf. pwr.t.test(n = 37, d = 2/3) from library('pwr')
alpha2 <- run(n = 37)
beta2 <- run(diff = 10, n = 37)
# N = 37
# no true difference (alpha errors):
# bf ~ 2%, cred ~ 4.55%, p ~ 5.65%, conf ~ 5.65%
# a similar pattern emerges as with ~100% power, though the difference between
# the Bayes factor and the credibility interval is not as large
# contradictions: bayes ~ 2.55%, freq ~ 0%
# true difference (beta errors):
# bf ~ 33%, cred ~ 22.45%, p ~ 18.1%, conf ~ 18.1%
# in Bayesian statistics, uncertainty *smacks you in the face*, so I'm not sure how meaningful this test is
# contradictions: bayes ~ 10.55%, freq ~ 0%
# NOTE:
# p-values *suck*, see http://www.ejwagenmakers.com/2008/BayesFreqBook.pdf
# here the Bayes factor comparison is used in a rather mindless way (Gigerenzer, 2004); note that
# Jeffreys described a Bayes factor as high as 5.33 as "odds that would interest a gambler, but
# would be hardly worth more than a passing mention in a scientific paper"
# (quoted from http://www.ejwagenmakers.com/inpress/VandekerckhoveEtAlinpress.pdf)