zdealveindy/P_values_and_multiple_testing.r

## P_values_and_multiple_testing.r
# Single test ----

# Scenarion 1: correlation between two randomly generated variables (null hypothesis is true) ----
x.r <- rnorm (100)
y.r <- rnorm (100)
cor.test (x, y)$p.value

# Scenario 2: correlation between two dependent variables (null hypothesis is false), single test ----
x.d <- rnorm (100)
y.d <- 0.2 * x.d + rnorm (100)
cor.test (x.d, y.d)$p.value

# Multiple tests (100 relationships, 10 traits and 10 environmental variables) ----
# Scenario 1: all null hypotheses are true ----
png (file = 'multiple_null_true_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (321)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- rnorm (100)
  plot (y ~ x, pch = 16, col = 'grey', cex = .4, axes = F, main = i)
  box ()
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# repeat
png (file = 'multiple_null_true_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (3210)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- rnorm (100)
  plot (y ~ x, pch = 16, col = 'grey', cex = .4, axes = F, main = i)
  box ()
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# Scenario 2: all null hypotheses are false ----
png (file = 'multiple_null_false_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (1234)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- 0.2 * x + rnorm (100)  # y is dependent on x + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = 'blue')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# repeat
png (file = 'multiple_null_false_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (12345)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- 0.2 * x + rnorm (100)  # y is dependent on x + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = 'blue')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# Scenario 3: first 10 combnations with null false, other 90 with null true ----
png (file = 'multiple_null_false_true_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (345)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
b <- c(rep (0.2, 10), rep (0, 90))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- b[i] * x + rnorm (100)  # y is dependent on x (but only for b > 0) + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  if (b[i] > 0)
    rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = if (b[i] > 0) 'blue' else 'black')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

png (file = 'multiple_null_false_true_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (3456)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
b <- c(rep (0.2, 10), rep (0, 90))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- b[i] * x + rnorm (100)  # y is dependent on x (but only for b > 0) + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  if (b[i] > 0)
    rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = if (b[i] > 0) 'blue' else 'black')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# Apply Bonferroni ----
png (file = 'multiple_null_false_true_bonf.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (3456)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
b <- c(rep (0.2, 10), rep (0, 90))
for (i in seq (1, 100))
{
  x <- rnorm (100)
  y <- b[i] * x + rnorm (100)  # y is dependent on x (but only for b > 0) + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  if (b[i] > 0)
    rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = if (b[i] > 0) 'blue' else 'black')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  p.adj <- p.adjust (p.value, n = 100, method = 'bonferroni')
  if (p.adj < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# Icrease sample size (power) and apply Bonferroni ----
png (file = 'multiple_null_false_true_bonf_increased-samplesize.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
set.seed (345678)
par (mfrow = c(10, 10))
par (mar = c(1,1,1,1))
b <- c(rep (0.2, 10), rep (0, 90))
for (i in seq (1, 100))
{
  x <- rnorm (500)
  y <- b[i] * x + rnorm (500)  # y is dependent on x (but only for b > 0) + added noise
  plot (y ~ x, axes = F, main = i, type = 'n')
  if (b[i] > 0)
    rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
  points (y ~ x, pch = 16, col = 'grey', cex = .4)
  box (col = if (b[i] > 0) 'blue' else 'black')
  lm.res <- lm (y ~ x)
  p.value <- anova (lm.res)$'Pr(>F)'[1]
  p.adj <- p.adjust (p.value, n = 100, method = 'bonferroni')
  if (p.adj < 0.05) abline (lm.res, col = 'red', lwd = 3)
}
dev.off ()

# Estimate the power of the test
# for regression with b = 0.2 and 100 samples
P.val.100 <- replicate (10000, {
  x <- rnorm (100)
  y <- 0.2 * x + rnorm (100)
  anova (lm (y ~ x))$'Pr(>F)'[1]
})
sum (P.val.100 < 0.05)/10000  # 0.504

# for regression with b = 0.2 and 100 samples
P.val.500 <- replicate (10000, {
  x <- rnorm (500)
  y <- 0.2 * x + rnorm (500)
  anova (lm (y ~ x))$'Pr(>F)'[1]
})
sum (P.val.500 < 0.05)/10000  # 0.993
	# Single test ----

	# Scenarion 1: correlation between two randomly generated variables (null hypothesis is true) ----
	x.r <- rnorm (100)
	y.r <- rnorm (100)
	cor.test (x, y)$p.value

	# Scenario 2: correlation between two dependent variables (null hypothesis is false), single test ----
	x.d <- rnorm (100)
	y.d <- 0.2 * x.d + rnorm (100)
	cor.test (x.d, y.d)$p.value

	# Multiple tests (100 relationships, 10 traits and 10 environmental variables) ----
	# Scenario 1: all null hypotheses are true ----
	png (file = 'multiple_null_true_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (321)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- rnorm (100)
	plot (y ~ x, pch = 16, col = 'grey', cex = .4, axes = F, main = i)
	box ()
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# repeat
	png (file = 'multiple_null_true_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (3210)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- rnorm (100)
	plot (y ~ x, pch = 16, col = 'grey', cex = .4, axes = F, main = i)
	box ()
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# Scenario 2: all null hypotheses are false ----
	png (file = 'multiple_null_false_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (1234)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- 0.2 * x + rnorm (100) # y is dependent on x + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = 'blue')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# repeat
	png (file = 'multiple_null_false_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (12345)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- 0.2 * x + rnorm (100) # y is dependent on x + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = 'blue')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# Scenario 3: first 10 combnations with null false, other 90 with null true ----
	png (file = 'multiple_null_false_true_1.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (345)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	b <- c(rep (0.2, 10), rep (0, 90))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- b[i] * x + rnorm (100) # y is dependent on x (but only for b > 0) + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	if (b[i] > 0)
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = if (b[i] > 0) 'blue' else 'black')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	png (file = 'multiple_null_false_true_2.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (3456)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	b <- c(rep (0.2, 10), rep (0, 90))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- b[i] * x + rnorm (100) # y is dependent on x (but only for b > 0) + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	if (b[i] > 0)
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = if (b[i] > 0) 'blue' else 'black')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	if (p.value < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# Apply Bonferroni ----
	png (file = 'multiple_null_false_true_bonf.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (3456)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	b <- c(rep (0.2, 10), rep (0, 90))
	for (i in seq (1, 100))
	{
	x <- rnorm (100)
	y <- b[i] * x + rnorm (100) # y is dependent on x (but only for b > 0) + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	if (b[i] > 0)
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = if (b[i] > 0) 'blue' else 'black')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	p.adj <- p.adjust (p.value, n = 100, method = 'bonferroni')
	if (p.adj < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# Icrease sample size (power) and apply Bonferroni ----
	png (file = 'multiple_null_false_true_bonf_increased-samplesize.png', width = 6, height = 6, res = 300, units = 'in', pointsize = 12)
	set.seed (345678)
	par (mfrow = c(10, 10))
	par (mar = c(1,1,1,1))
	b <- c(rep (0.2, 10), rep (0, 90))
	for (i in seq (1, 100))
	{
	x <- rnorm (500)
	y <- b[i] * x + rnorm (500) # y is dependent on x (but only for b > 0) + added noise
	plot (y ~ x, axes = F, main = i, type = 'n')
	if (b[i] > 0)
	rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#ADD8E680")
	points (y ~ x, pch = 16, col = 'grey', cex = .4)
	box (col = if (b[i] > 0) 'blue' else 'black')
	lm.res <- lm (y ~ x)
	p.value <- anova (lm.res)$'Pr(>F)'[1]
	p.adj <- p.adjust (p.value, n = 100, method = 'bonferroni')
	if (p.adj < 0.05) abline (lm.res, col = 'red', lwd = 3)
	}
	dev.off ()

	# Estimate the power of the test
	# for regression with b = 0.2 and 100 samples
	P.val.100 <- replicate (10000, {
	x <- rnorm (100)
	y <- 0.2 * x + rnorm (100)
	anova (lm (y ~ x))$'Pr(>F)'[1]
	})
	sum (P.val.100 < 0.05)/10000 # 0.504

	# for regression with b = 0.2 and 100 samples
	P.val.500 <- replicate (10000, {
	x <- rnorm (500)
	y <- 0.2 * x + rnorm (500)
	anova (lm (y ~ x))$'Pr(>F)'[1]
	})
	sum (P.val.500 < 0.05)/10000 # 0.993