Skip to content

Instantly share code, notes, and snippets.

@m-Py
Created January 16, 2019 13:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m-Py/8ce68201dd0fd498504e00e2e2f38682 to your computer and use it in GitHub Desktop.
Save m-Py/8ce68201dd0fd498504e00e2e2f38682 to your computer and use it in GitHub Desktop.
How the p value in a t test can be minimized by data removal
## Warning: This code is just for fun / educational purposes; the file contains functions
## to find out how severely the p value in a t-test can be minimized by systematic removal of data points.
## SIX OUT OF THIRTY - Martin's approach
## Based on @juli_tkotz's (https://twitter.com/juli_tkotz/status/1085446224117985281)
## idea that removing from the most extreme values is the best apporach.
#' Simulate t-tests and store best p values
#'
#' nsim = How many iterations in the simulation
#'
#' return: For each run, the best p value that could be obtained by
#' removing 6 cases from the sample
#'
ttest_sim <- function(nsim) {
# prepare matrix to be filled with values of t-test
p_values <- rep(NA, nsim)
# p value, t value, effect size
for (i in 1:nsim) {
# both groups: 30 values drawn from a normal distribution, mean = 0, SD = 1
dat1 <- rnorm(30)
dat2 <- rnorm(30)
p_values[i] <- get_best_pvalue(dat1, dat2, remove = 6)
}
return(p_values)
}
#' Lowest p value for the comparison of two group means with fixed number
#' of observations removed
#'
#' dat1 = vector of numeric values for group 1
#' dat2 = vector of numeric values for group 2
#' remove = how many observations have to be removed
#'
#' returns: The best possible p value for the comparison of the two
#' groups when cases are removed
#'
get_best_pvalue <- function(dat1, dat2, remove) {
## Initialize best p value:
best_p <- Inf
## Generate data frame containing all combinations of possible
## data removals from the two groups:
to_be_removed <- expand.grid(group1 = 0:remove, group2 = 0:remove)
to_be_removed <- to_be_removed[rowSums(to_be_removed) == remove, ]
# The following line would remove all six in either one group or the other:
# to_be_removed <- data.frame(group1 = c(0, 6), group2 = c(6, 0))
for (i in 1:nrow(to_be_removed)) {
## Get indices of values that are not removed:
indices <- get_indices(to_be_removed, i, dat1, dat2)
## Test all combinations of removals of extreme values:
for (j in c(TRUE, FALSE)) {
## Sort both groups in decreasing and increasing order to capture
## all combinations of extreme values: (therefore, j = TRUE; FALSE)
tmp1 <- sort(dat1, decreasing = j)[indices[[1]]]
tmp2 <- sort(dat2, decreasing = !j)[indices[[2]]]
## Assert that the correct number of observations was removed:
if (length(tmp1) + length(tmp2) != length(dat1) + length(dat2) - remove)
stop("Something went wrong, the incorrect number of cases was removed")
ttest <- t.test(tmp1, tmp2, var.equal = TRUE)
if (ttest$p.value < best_p)
best_p <- ttest$p.value
}
}
return(best_p)
}
#' Get indices for selection of non-extreme values
#'
#' to_be_removed = data frame that contains the number of observation
#' to be removed by group (created within function `get_best_pvalue`)
#' row = which row of `to_be_removed` is targetted
#' dat1 = vector of numeric values for group 1
#' dat2 = vector of numeric values for group 2
#'
#' returns: A list of two elements containing the indices that are
#' selected in each group
get_indices <- function(to_be_removed, row, dat1, dat2) {
i1 <- to_be_removed[row, "group1"]
i2 <- to_be_removed[row, "group2"]
select_from1 <- setdiff(1:length(dat1), 0:i1)
select_from2 <- setdiff(1:length(dat2), 0:i2)
return(list(select_from1, select_from2))
}
# Run 10000 simulations
sim1 <- ttest_sim(10000)
mean(sim1 < .05)
# 0.7722
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment