nestordemeure/pval.R

## pval.R
library(magrittr)
library(pwr)

#------------------------------------------------------------------------------
# probability of error computation

# estimate beta for a t-test with inequal nb_ko and nb_wt
# see https://www.statmethods.net/stats/power.html
# use formulas from Cohen 1988 http://www.utstat.toronto.edu/~brunner/oldclass/378f16/readings/CohenPower.pdf
compute_beta <- function(alpha, nb_ko, nb_wt, effect_size)
{
   # run the function only if it is defined
   if ((nb_ko < 2) || (nb_wt < 2))
   {
      return(NA)
   }

   # use the appropriate ttest if there are as many ko as wt
   if (nb_ko == nb_wt)
   {
      # we suppose that a paired test if done if there are as many ko as wt
      power <- pwr.t.test(n = nb_ko, d = effect_size, sig.level = alpha, type="paired")$power
   }
   else
   {
      power <- pwr.t2n.test(n1 = nb_ko, n2 = nb_wt, d = effect_size, sig.level = alpha)$power
   }

   beta <- 1 - power
   return(beta)
}
compute_beta <- Vectorize(compute_beta)

# function to minimize in order to minimize the sum of risk alpha and beta
# computed for a single test
probability_of_error <- function(alpha, nb_ko, nb_wt, effect_size, proba_H1=0.5)
{
   beta <- compute_beta(alpha, nb_ko, nb_wt, effect_size)
   proba_H0 <- 1 - proba_H1
   risk <- proba_H0*alpha + proba_H1*beta
   return(risk)
}

#------------------------------------------------------------------------------
# P-Value Selection

# finds the largest p-value that has a probability of error within a relativ error, relativ_tolerance, of proba_H1
# under the hypothesis that proba_H1 is optimal
# meaning that the optimized function has no distinct minimum and is, instead, flat around 0
backup_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10, relativ_tolerance=5e-3)
{
   # target error probability
   target_error <- proba_H1*(1 + relativ_tolerance)

   # does the optimization in log scale to get finer around 0 without losing time elsewhere
   inf <- min_pval # > 0
   sup <- 1
   tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
   inf_log <- log(inf)
   sup_log <- log(sup)
   tol_log <- log(1 + tol/sup)

   # function that reaches zero when the loss reaches target_error
   shifted_loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1) - target_error

   # finds the zero of the function
   root <- uniroot(shifted_loss_log, interval = c(inf_log,sup_log), tol = tol_log)$root %>% exp()

   return(root)
}

# computes the pvalue that minimizes the sum of risks for a given problem
#
# if it is passed an array of parameters, it will consider them as part of the same test
# and return a single p-value under the assumption that if at least one experiment returns true then the overall test returns true
#
# see 'Setting an Optimal α That Minimizes Errors in Null Hypothesis Significance Tests'
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0032734
optimum_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10)
{
   # does the optimization in log scale to get finer around 0 without losing time elsewhere
   inf <- min_pval # > 0
   sup <- 1
   tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
   inf_log <- log(inf)
   sup_log <- log(sup)
   tol_log <- log(1 + tol/sup)

   # probability of making an error when using a pvalue as a threshold
   loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1)

   # finds the minimum of the loss function
   solution <- optimize(loss_log, interval = c(inf_log,sup_log), maximum = FALSE, tol = tol_log)

   # get the solution
   min_error <- solution$objective
   best_pvalue <- exp(solution$minimum)

   if( (best_pvalue < 2*min_pval) && (min_error >= proba_H1) )
   {
      # the error function is flat around 0 with no distinct minimum
      warning("It appears the probability of error as a function of the p-value does ot have a distinct minimum.\nIt probably means that your experiment does not have enough power / samples.\nThe algorithm will return the largest p-value that is within 1/1000 of the optimal error probability.")

      # we return the largest p-value with an error probability within less than 1% of the optimal
      pvalue <- backup_pval(nb_ko, nb_wt, effect_size, proba_H1, min_pval, relativ_tolerance=0.005)

      return(pvalue)
   }
   else
   {
      # it is a true minimum of the error function
      return(best_pvalue)
   }
}

#------------------------------------------------------------------------------
# PLOT

# plots the probability of making an error as a function of the pvalue
# and the optimal choice of p-value as a red line
plot_error <- function(nb_ko, nb_wt, effect_size, proba_H1=0.5, min_pval=1e-5)
{
   pvalues <- seq(from=log(min_pval), to=0, by=0.1) %>% exp()
   errors <- sapply(pvalues, function(pval) probability_of_error(pval, nb_ko, nb_wt, effect_size, proba_H1))
   best_pval <- optimum_pval(nb_ko, nb_wt, effect_size, proba_H1)

   title <- paste("Probability of error as a function of the p-value", "(optimum:", sprintf("%.2e", best_pval), ")")

   X11()
   ggplot(NULL, aes(x=pvalues, y=errors)) +
      ggtitle(title) +
      labs(x = "p-value", y="probability of error") +
      scale_x_log10() + #scale_y_log10() +
      geom_line() +
      geom_vline(xintercept = best_pval, linetype="dashed", color = "red")
}

#------------------------------------------------------------------------------
# TEST

nb_wt <- 5 # number of controls
nb_ko <- 5 # number of non controls
effect_size <- 0.2 # small effect, 0.5 would be medium and 0.8 large
plot_error(nb_ko, nb_wt, effect_size) # displays the plot of error associated
	library(magrittr)
	library(pwr)

	#------------------------------------------------------------------------------
	# probability of error computation

	# estimate beta for a t-test with inequal nb_ko and nb_wt
	# see https://www.statmethods.net/stats/power.html
	# use formulas from Cohen 1988 http://www.utstat.toronto.edu/~brunner/oldclass/378f16/readings/CohenPower.pdf
	compute_beta <- function(alpha, nb_ko, nb_wt, effect_size)
	{
	# run the function only if it is defined
	if ((nb_ko < 2) \|\| (nb_wt < 2))
	{
	return(NA)
	}

	# use the appropriate ttest if there are as many ko as wt
	if (nb_ko == nb_wt)
	{
	# we suppose that a paired test if done if there are as many ko as wt
	power <- pwr.t.test(n = nb_ko, d = effect_size, sig.level = alpha, type="paired")$power
	}
	else
	{
	power <- pwr.t2n.test(n1 = nb_ko, n2 = nb_wt, d = effect_size, sig.level = alpha)$power
	}

	beta <- 1 - power
	return(beta)
	}
	compute_beta <- Vectorize(compute_beta)

	# function to minimize in order to minimize the sum of risk alpha and beta
	# computed for a single test
	probability_of_error <- function(alpha, nb_ko, nb_wt, effect_size, proba_H1=0.5)
	{
	beta <- compute_beta(alpha, nb_ko, nb_wt, effect_size)
	proba_H0 <- 1 - proba_H1
	risk <- proba_H0alpha + proba_H1beta
	return(risk)
	}

	#------------------------------------------------------------------------------
	# P-Value Selection

	# finds the largest p-value that has a probability of error within a relativ error, relativ_tolerance, of proba_H1
	# under the hypothesis that proba_H1 is optimal
	# meaning that the optimized function has no distinct minimum and is, instead, flat around 0
	backup_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10, relativ_tolerance=5e-3)
	{
	# target error probability
	target_error <- proba_H1*(1 + relativ_tolerance)

	# does the optimization in log scale to get finer around 0 without losing time elsewhere
	inf <- min_pval # > 0
	sup <- 1
	tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
	inf_log <- log(inf)
	sup_log <- log(sup)
	tol_log <- log(1 + tol/sup)

	# function that reaches zero when the loss reaches target_error
	shifted_loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1) - target_error

	# finds the zero of the function
	root <- uniroot(shifted_loss_log, interval = c(inf_log,sup_log), tol = tol_log)$root %>% exp()

	return(root)
	}

	# computes the pvalue that minimizes the sum of risks for a given problem
	#
	# if it is passed an array of parameters, it will consider them as part of the same test
	# and return a single p-value under the assumption that if at least one experiment returns true then the overall test returns true
	#
	# see 'Setting an Optimal α That Minimizes Errors in Null Hypothesis Significance Tests'
	# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0032734
	optimum_pval <- function(nb_ko, nb_wt, effect_size, proba_H1 = 0.5, min_pval=1e-10)
	{
	# does the optimization in log scale to get finer around 0 without losing time elsewhere
	inf <- min_pval # > 0
	sup <- 1
	tol <- 0.1 # numerical tolerance around sup (which will be scaled for inf)
	inf_log <- log(inf)
	sup_log <- log(sup)
	tol_log <- log(1 + tol/sup)

	# probability of making an error when using a pvalue as a threshold
	loss_log <- function(pval_log) probability_of_error(exp(pval_log), nb_ko, nb_wt, effect_size, proba_H1)

	# finds the minimum of the loss function
	solution <- optimize(loss_log, interval = c(inf_log,sup_log), maximum = FALSE, tol = tol_log)

	# get the solution
	min_error <- solution$objective
	best_pvalue <- exp(solution$minimum)

	if( (best_pvalue < 2*min_pval) && (min_error >= proba_H1) )
	{
	# the error function is flat around 0 with no distinct minimum
	warning("It appears the probability of error as a function of the p-value does ot have a distinct minimum.\nIt probably means that your experiment does not have enough power / samples.\nThe algorithm will return the largest p-value that is within 1/1000 of the optimal error probability.")

	# we return the largest p-value with an error probability within less than 1% of the optimal
	pvalue <- backup_pval(nb_ko, nb_wt, effect_size, proba_H1, min_pval, relativ_tolerance=0.005)

	return(pvalue)
	}
	else
	{
	# it is a true minimum of the error function
	return(best_pvalue)
	}
	}

	#------------------------------------------------------------------------------
	# PLOT

	# plots the probability of making an error as a function of the pvalue
	# and the optimal choice of p-value as a red line
	plot_error <- function(nb_ko, nb_wt, effect_size, proba_H1=0.5, min_pval=1e-5)
	{
	pvalues <- seq(from=log(min_pval), to=0, by=0.1) %>% exp()
	errors <- sapply(pvalues, function(pval) probability_of_error(pval, nb_ko, nb_wt, effect_size, proba_H1))
	best_pval <- optimum_pval(nb_ko, nb_wt, effect_size, proba_H1)

	title <- paste("Probability of error as a function of the p-value", "(optimum:", sprintf("%.2e", best_pval), ")")

	X11()
	ggplot(NULL, aes(x=pvalues, y=errors)) +
	ggtitle(title) +
	labs(x = "p-value", y="probability of error") +
	scale_x_log10() + #scale_y_log10() +
	geom_line() +
	geom_vline(xintercept = best_pval, linetype="dashed", color = "red")
	}

	#------------------------------------------------------------------------------
	# TEST

	nb_wt <- 5 # number of controls
	nb_ko <- 5 # number of non controls
	effect_size <- 0.2 # small effect, 0.5 would be medium and 0.8 large
	plot_error(nb_ko, nb_wt, effect_size) # displays the plot of error associated