mja/typeI.R

## typeI.R
library("ggplot2")
library(plyr)

n.sims <- 100
max.n.vars <- 100
n.obs <- 100

res <- data.frame()

for (sim in 1:n.sims)
{
	for (n.vars in 1:max.n.vars)
	{
		y <- rnorm(n.obs)
		x <- matrix(NA, nrow = n.obs, ncol = n.vars)
		for (j in 1:n.vars)
		{
			x[, j] <- rnorm(n.obs)
		}
		fit <- lm(y ~ x - 1)
		p.vals <- coef(summary(fit))[, 4]
		false.positives <- sum(p.vals < 0.05)
		res <- rbind(res, data.frame(Sim = sim, Vars = n.vars, FP = false.positives))
	}
}

ggplot(res, aes(x = Vars, y = FP)) +
    geom_smooth() +
    xlab("Number of Variables in Multiple Regression") +
    ylab("Average Number of Variables for which p < 0.05") +
    ggtitle("Number of False Positives Grows Linearly with Number of Variables")
ggsave("false_positives.pdf")

# False positive rate per number of variables
Rate <- ddply(res, .(Vars), function(x) data.frame(rate=sum(x$FP) / sum(x$Vars)))


ggplot(Rate, aes(x=Vars, y=rate)) +
  geom_point() +
  stat_smooth(method=lm) +
  xlab("Number of Variables in Multiple Regression") +
  ylab("False positive rate") +
  ggtitle("Rate of False Positives is Constant")

# Family-wise error rate = % of times we get any false positives for a
# given number of predictors
FWER <- ddply(res, .(Vars), function(x) data.frame(fwer=mean(x$FP > 0)))

ggplot(FWER, aes(x=Vars, y=fwer)) +
  geom_point() +
  stat_smooth() +
  scale_y_continuous(limits=c(0, 1)) +
  xlab("Number of Variables in Multiple Regression") +
  ylab("Familywise error rate") +
  ggtitle("Familywise error rate increases until parameters saturate the data")
	library("ggplot2")
	library(plyr)

	n.sims <- 100
	max.n.vars <- 100
	n.obs <- 100

	res <- data.frame()

	for (sim in 1:n.sims)
	{
	for (n.vars in 1:max.n.vars)
	{
	y <- rnorm(n.obs)
	x <- matrix(NA, nrow = n.obs, ncol = n.vars)
	for (j in 1:n.vars)
	{
	x[, j] <- rnorm(n.obs)
	}
	fit <- lm(y ~ x - 1)
	p.vals <- coef(summary(fit))[, 4]
	false.positives <- sum(p.vals < 0.05)
	res <- rbind(res, data.frame(Sim = sim, Vars = n.vars, FP = false.positives))
	}
	}

	ggplot(res, aes(x = Vars, y = FP)) +
	geom_smooth() +
	xlab("Number of Variables in Multiple Regression") +
	ylab("Average Number of Variables for which p < 0.05") +
	ggtitle("Number of False Positives Grows Linearly with Number of Variables")
	ggsave("false_positives.pdf")

	# False positive rate per number of variables
	Rate <- ddply(res, .(Vars), function(x) data.frame(rate=sum(x$FP) / sum(x$Vars)))


	ggplot(Rate, aes(x=Vars, y=rate)) +
	geom_point() +
	stat_smooth(method=lm) +
	xlab("Number of Variables in Multiple Regression") +
	ylab("False positive rate") +
	ggtitle("Rate of False Positives is Constant")

	# Family-wise error rate = % of times we get any false positives for a
	# given number of predictors
	FWER <- ddply(res, .(Vars), function(x) data.frame(fwer=mean(x$FP > 0)))

	ggplot(FWER, aes(x=Vars, y=fwer)) +
	geom_point() +
	stat_smooth() +
	scale_y_continuous(limits=c(0, 1)) +
	xlab("Number of Variables in Multiple Regression") +
	ylab("Familywise error rate") +
	ggtitle("Familywise error rate increases until parameters saturate the data")