Skip to content

Instantly share code, notes, and snippets.

@nestordemeure
Created August 8, 2020 12:48
Show Gist options
  • Save nestordemeure/7c02143a1dc55373a4a137fe4d95d507 to your computer and use it in GitHub Desktop.
Save nestordemeure/7c02143a1dc55373a4a137fe4d95d507 to your computer and use it in GitHub Desktop.
Finding the optimal pvalue for a t-test
library(magrittr)
library(pwr)
#------------------------------------------------------------------------------
# probability of error computation
# estimate beta for a t-test with inequal nb_ko and nb_wt
# see https://www.statmethods.net/stats/power.html
# use formulas from Cohen 1988 http://www.utstat.toronto.edu/~brunner/oldclass/378f16/readings/CohenPower.pdf
compute_beta <- function(alpha, nb_ko, nb_wt, effect_size)
{
# run the function only if it is defined
if ((nb_ko < 2) || (nb_wt < 2))
{
return(NA)
}
# use the appropriate ttest if there are as many ko as wt
if (nb_ko == nb_wt)
{
# we suppose that a paired test if done if there are as many ko as wt
power <- pwr.t.test(n = nb_ko, d = effect_size, sig.level = alpha, type="paired")$power
}
else
{
power <- pwr.t2n.test(n1 = nb_ko, n2 = nb_wt, d = effect_size, sig.level = alpha)$power
}
beta <- 1 - power
return(beta)
}
compute_beta <- Vectorize(compute_beta)
# function to minimize in order to minimize the sum of risk alpha and beta
# computed for a single test
probability_of_error <- function(alpha, nb_ko, nb_wt, effect_size, proba_H1=0.5)
{
beta <- compute_beta(alpha, nb_ko, nb_wt, effect_size)
proba_H0 <- 1 - proba_H1
risk <- proba_H0*alpha + proba_H1*beta
return(risk)
}
#------------------------------------------------------------------------------
# P-Value Selection
# finds the largest p-value that has a probability of error within a relativ error, relativ_tolerance, of proba_H1
# under the hypothesis that proba_H1 is optimal
# meaning that the optimized function has no distinct minimum and is, instead, flat around 0
backup_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10, relativ_tolerance=5e-3)
{
# target error probability
target_error <- proba_H1*(1 + relativ_tolerance)
# does the optimization in log scale to get finer around 0 without losing time elsewhere
inf <- min_pval # > 0
sup <- 1
tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
inf_log <- log(inf)
sup_log <- log(sup)
tol_log <- log(1 + tol/sup)
# function that reaches zero when the loss reaches target_error
shifted_loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1) - target_error
# finds the zero of the function
root <- uniroot(shifted_loss_log, interval = c(inf_log,sup_log), tol = tol_log)$root %>% exp()
return(root)
}
# computes the pvalue that minimizes the sum of risks for a given problem
#
# if it is passed an array of parameters, it will consider them as part of the same test
# and return a single p-value under the assumption that if at least one experiment returns true then the overall test returns true
#
# see 'Setting an Optimal α That Minimizes Errors in Null Hypothesis Significance Tests'
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0032734
optimum_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10)
{
# does the optimization in log scale to get finer around 0 without losing time elsewhere
inf <- min_pval # > 0
sup <- 1
tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
inf_log <- log(inf)
sup_log <- log(sup)
tol_log <- log(1 + tol/sup)
# probability of making an error when using a pvalue as a threshold
loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1)
# finds the minimum of the loss function
solution <- optimize(loss_log, interval = c(inf_log,sup_log), maximum = FALSE, tol = tol_log)
# get the solution
min_error <- solution$objective
best_pvalue <- exp(solution$minimum)
if( (best_pvalue < 2*min_pval) && (min_error >= proba_H1) )
{
# the error function is flat around 0 with no distinct minimum
warning("It appears the probability of error as a function of the p-value does ot have a distinct minimum.\nIt probably means that your experiment does not have enough power / samples.\nThe algorithm will return the largest p-value that is within 1/1000 of the optimal error probability.")
# we return the largest p-value with an error probability within less than 1% of the optimal
pvalue <- backup_pval(nb_ko, nb_wt, effect_size, proba_H1, min_pval, relativ_tolerance=0.005)
return(pvalue)
}
else
{
# it is a true minimum of the error function
return(best_pvalue)
}
}
#------------------------------------------------------------------------------
# PLOT
# plots the probability of making an error as a function of the pvalue
# and the optimal choice of p-value as a red line
plot_error <- function(nb_ko, nb_wt, effect_size, proba_H1=0.5, min_pval=1e-5)
{
pvalues <- seq(from=log(min_pval), to=0, by=0.1) %>% exp()
errors <- sapply(pvalues, function(pval) probability_of_error(pval, nb_ko, nb_wt, effect_size, proba_H1))
best_pval <- optimum_pval(nb_ko, nb_wt, effect_size, proba_H1)
title <- paste("Probability of error as a function of the p-value", "(optimum:", sprintf("%.2e", best_pval), ")")
X11()
ggplot(NULL, aes(x=pvalues, y=errors)) +
ggtitle(title) +
labs(x = "p-value", y="probability of error") +
scale_x_log10() + #scale_y_log10() +
geom_line() +
geom_vline(xintercept = best_pval, linetype="dashed", color = "red")
}
#------------------------------------------------------------------------------
# TEST
nb_wt <- 5 # number of controls
nb_ko <- 5 # number of non controls
effect_size <- 0.2 # small effect, 0.5 would be medium and 0.8 large
plot_error(nb_ko, nb_wt, effect_size) # displays the plot of error associated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment