m-Py/SIX_OUT_OF_THIRTY.R

## SIX_OUT_OF_THIRTY.R
## Warning: This code is just for fun / educational purposes; the file contains functions
## to find out how severely the p value in a t-test can be minimized by systematic removal of data points.

## SIX OUT OF THIRTY - Martin's approach
## Based on @juli_tkotz's (https://twitter.com/juli_tkotz/status/1085446224117985281)
## idea that removing from the most extreme values is the best apporach.


#' Simulate t-tests and store best p values
#'
#' nsim = How many iterations in the simulation
#'
#' return: For each run, the best p value that could be obtained by
#'   removing 6 cases from the sample
#'
ttest_sim <- function(nsim) {
  # prepare matrix to be filled with values of t-test
  p_values <- rep(NA, nsim)
  # p value, t value, effect size
  for (i in 1:nsim) {
    # both groups: 30 values drawn from a normal distribution, mean = 0, SD = 1
    dat1 <- rnorm(30)
    dat2 <- rnorm(30)
    p_values[i] <- get_best_pvalue(dat1, dat2, remove = 6)
  }
  return(p_values)
}

#' Lowest p value for the comparison of two group means with fixed number
#' of observations removed
#'
#' dat1 = vector of numeric values for group 1
#' dat2 = vector of numeric values for group 2
#' remove = how many observations have to be removed
#'
#' returns: The best possible p value for the comparison of the two
#'   groups when cases are removed
#'
get_best_pvalue <- function(dat1, dat2, remove) {
  ## Initialize best p value:
  best_p <- Inf
  ## Generate data frame containing all combinations of possible
  ## data removals from the two groups:
  to_be_removed <- expand.grid(group1 = 0:remove, group2 = 0:remove)
  to_be_removed <- to_be_removed[rowSums(to_be_removed) == remove, ]
  # The following line would remove all six in either one group or the other:
  # to_be_removed <- data.frame(group1 = c(0, 6), group2 = c(6, 0))

  for (i in 1:nrow(to_be_removed)) {
    ## Get indices of values that are not removed:
    indices <- get_indices(to_be_removed, i, dat1, dat2)
    ## Test all combinations of removals of extreme values:
    for (j in c(TRUE, FALSE)) {
      ## Sort both groups in decreasing and increasing order to capture
      ## all combinations of extreme values: (therefore, j = TRUE; FALSE)
      tmp1 <- sort(dat1, decreasing = j)[indices[[1]]]
      tmp2 <- sort(dat2, decreasing = !j)[indices[[2]]]
      ## Assert that the correct number of observations was removed:
      if (length(tmp1) + length(tmp2) != length(dat1) + length(dat2) - remove)
        stop("Something went wrong, the incorrect number of cases was removed")
      ttest <- t.test(tmp1, tmp2, var.equal = TRUE)
      if (ttest$p.value < best_p)
        best_p <- ttest$p.value
    }
  }
  return(best_p)
}

#' Get indices for selection of non-extreme values
#'
#' to_be_removed = data frame that contains the number of observation
#'   to be removed by group (created within function `get_best_pvalue`)
#' row = which row of `to_be_removed` is targetted
#' dat1 = vector of numeric values for group 1
#' dat2 = vector of numeric values for group 2
#'
#' returns: A list of two elements containing the indices that are
#'   selected in each group

get_indices <- function(to_be_removed, row, dat1, dat2) {
  i1 <- to_be_removed[row, "group1"]
  i2 <- to_be_removed[row, "group2"]
  select_from1 <- setdiff(1:length(dat1), 0:i1)
  select_from2 <- setdiff(1:length(dat2), 0:i2)
  return(list(select_from1, select_from2))
}


# Run 10000 simulations
sim1 <- ttest_sim(10000)

mean(sim1 < .05)
# 0.7722
	## Warning: This code is just for fun / educational purposes; the file contains functions
	## to find out how severely the p value in a t-test can be minimized by systematic removal of data points.

	## SIX OUT OF THIRTY - Martin's approach
	## Based on @juli_tkotz's (https://twitter.com/juli_tkotz/status/1085446224117985281)
	## idea that removing from the most extreme values is the best apporach.


	#' Simulate t-tests and store best p values
	#'
	#' nsim = How many iterations in the simulation
	#'
	#' return: For each run, the best p value that could be obtained by
	#' removing 6 cases from the sample
	#'
	ttest_sim <- function(nsim) {
	# prepare matrix to be filled with values of t-test
	p_values <- rep(NA, nsim)
	# p value, t value, effect size
	for (i in 1:nsim) {
	# both groups: 30 values drawn from a normal distribution, mean = 0, SD = 1
	dat1 <- rnorm(30)
	dat2 <- rnorm(30)
	p_values[i] <- get_best_pvalue(dat1, dat2, remove = 6)
	}
	return(p_values)
	}

	#' Lowest p value for the comparison of two group means with fixed number
	#' of observations removed
	#'
	#' dat1 = vector of numeric values for group 1
	#' dat2 = vector of numeric values for group 2
	#' remove = how many observations have to be removed
	#'
	#' returns: The best possible p value for the comparison of the two
	#' groups when cases are removed
	#'
	get_best_pvalue <- function(dat1, dat2, remove) {
	## Initialize best p value:
	best_p <- Inf
	## Generate data frame containing all combinations of possible
	## data removals from the two groups:
	to_be_removed <- expand.grid(group1 = 0:remove, group2 = 0:remove)
	to_be_removed <- to_be_removed[rowSums(to_be_removed) == remove, ]
	# The following line would remove all six in either one group or the other:
	# to_be_removed <- data.frame(group1 = c(0, 6), group2 = c(6, 0))

	for (i in 1:nrow(to_be_removed)) {
	## Get indices of values that are not removed:
	indices <- get_indices(to_be_removed, i, dat1, dat2)
	## Test all combinations of removals of extreme values:
	for (j in c(TRUE, FALSE)) {
	## Sort both groups in decreasing and increasing order to capture
	## all combinations of extreme values: (therefore, j = TRUE; FALSE)
	tmp1 <- sort(dat1, decreasing = j)[indices[[1]]]
	tmp2 <- sort(dat2, decreasing = !j)[indices[[2]]]
	## Assert that the correct number of observations was removed:
	if (length(tmp1) + length(tmp2) != length(dat1) + length(dat2) - remove)
	stop("Something went wrong, the incorrect number of cases was removed")
	ttest <- t.test(tmp1, tmp2, var.equal = TRUE)
	if (ttest$p.value < best_p)
	best_p <- ttest$p.value
	}
	}
	return(best_p)
	}

	#' Get indices for selection of non-extreme values
	#'
	#' to_be_removed = data frame that contains the number of observation
	#' to be removed by group (created within function `get_best_pvalue`)
	#' row = which row of `to_be_removed` is targetted
	#' dat1 = vector of numeric values for group 1
	#' dat2 = vector of numeric values for group 2
	#'
	#' returns: A list of two elements containing the indices that are
	#' selected in each group

	get_indices <- function(to_be_removed, row, dat1, dat2) {
	i1 <- to_be_removed[row, "group1"]
	i2 <- to_be_removed[row, "group2"]
	select_from1 <- setdiff(1:length(dat1), 0:i1)
	select_from2 <- setdiff(1:length(dat2), 0:i2)
	return(list(select_from1, select_from2))
	}


	# Run 10000 simulations
	sim1 <- ttest_sim(10000)

	mean(sim1 < .05)
	# 0.7722