Skip to content

Instantly share code, notes, and snippets.

@tokestermw
Last active September 8, 2015 04:15
Show Gist options
  • Save tokestermw/ca7644fcf895b51adc73 to your computer and use it in GitHub Desktop.
Save tokestermw/ca7644fcf895b51adc73 to your computer and use it in GitHub Desktop.
More data is not necessarily better
library("lme4")
library("ggplot2")
library("plyr")
library(reshape2)
head(Dyestuff)
# Batch Yield
# 1 A 1545
# 2 A 1440
# 3 A 1440
# 4 A 1520
# 5 A 1580
# 6 B 1540
grp <- ddply(Dyestuff, "Batch", transform, batch.mean = mean(Yield))
overall.mean <- mean(grp$Yield)
ggplot(aes(Batch, Yield), data=grp) + geom_point() +
geom_point(aes(Batch, batch.mean), data=grp, colour="red", shape=2) +
geom_hline(y=overall.mean, colour="red")
mse <- function(E, O) {
mean((E - O)^2)
}
mse(overall.mean, Dyestuff$Yield)
# [1] 3839.583
mse(grp$batch.mean, Dyestuff$Yield) # equal to lm() prediction
# [1] 1961
model <- lmer(Yield ~ 1 | Batch, data=Dyestuff)
mse(predict(model), Dyestuff$Yield)
# [1] 2049.847
add.batch <- rep(Dyestuff$Batch, 4)
add.yield <- c(jitter(Dyestuff$Yield, 50), jitter(Dyestuff$Yield, 50), jitter(Dyestuff$Yield, 50), jitter(Dyestuff$Yield, 50))
Dyestuff <- rbind(Dyestuff, data.frame(Batch=add.batch, Yield=add.yield))
temp <- Dyestuff2
out <- data.frame()
for (i in seq(1, 10, 1)) {
sample.size <- dim(Dyestuff)[1] * i
overall.mean <- mean(temp$Yield)
overall.mse <- mse(overall.mean, temp$Yield)
preds <- predict(lm(Yield ~ 1 + Batch, data=temp))
lm.mse <- mse(preds, temp$Yield)
# preds <- predict(lmer(Yield ~ 1 | Batch, data=temp, REML=TRUE))
# lmer.mse <- mse(preds, temp$Yield)
out <- rbind(out, data.frame(sample.size, overall.mse, lm.mse))#, lmer.mse))
# add error towards shrinkage
add <- ddply(Dyestuff2, "Batch", transform, batch.mean=jitter(mean(Yield), .1))
# add residual error
add <- transform(add, batch.yield=rnorm(30, batch.mean, 10))
add.batch <- Dyestuff2$Batch
add.yield <- add$batch.yield
temp <- rbind(temp, data.frame(Batch=add.batch, Yield=add.yield))
}
m <- melt(out, id="sample.size")
ggplot(aes(x=sample.size, y=value, colour=variable), data=m) + geom_line() + expand_limits(y=0) + ylab('Error')
ggplot(aes(Batch, Yield), data=temp) + geom_point() +
geom_point(aes(Batch, batch.mean), data=grp, colour="red", shape=2) +
geom_hline(y=overall.mean, colour="red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment