Skip to content

Instantly share code, notes, and snippets.

@mkowoods
Last active August 29, 2015 14:10
Show Gist options
  • Save mkowoods/797e8b9bf446fdcd209f to your computer and use it in GitHub Desktop.
Save mkowoods/797e8b9bf446fdcd209f to your computer and use it in GitHub Desktop.
Playing with Test and Training Set Error
# Playing with training vs. test set error.
#
# Simulates a binary outcome and `n_col` Gaussian predictors, fits a sequence
# of logistic regressions that use the top-i predictors (ranked by their
# correlation with the outcome), and plots training and held-out accuracy
# against the number of predictors used.
library(ggplot2)

n_col <- 100
n_rows <- 1000
set.seed(500)

rand.matrix <- matrix(NA_real_, nrow = n_rows, ncol = n_col)
rand.class <- rbinom(n = n_rows, size = 1, prob = 0.7)

# Assign some variables to be randomly relevant: the first five predictors get
# an extra noise term that is only present when the class is 1.
for (i in seq_len(n_col)) {
  predictor <- rnorm(n_rows)
  if (i <= 5) {
    rand.matrix[, i] <- predictor + rand.class * rnorm(n_rows)
  } else {
    rand.matrix[, i] <- predictor
  }
}

# Column 1 is the outcome; columns 2..(n_col + 1) are the predictors.
rand.data <- as.data.frame(cbind(rand.class, rand.matrix))

# 60/40 split. The test rows must be the complement of the training rows;
# drawing a second independent sample would overlap with the training set and
# leak training observations into the test accuracy.
training <- sample(seq_len(n_rows), 0.6 * n_rows)
test <- setdiff(seq_len(n_rows), training)

training.acc <- rep(NA_real_, n_col)
test.acc <- rep(NA_real_, n_col)

# Rank columns by correlation with the outcome. The outcome itself has
# correlation 1 and therefore comes first, so ordered.cor[1:(i + 1)] is the
# outcome plus the i highest-ranked predictors.
# NOTE(review): the ranking uses signed correlation over all rows (training
# and test); consider abs() and restricting to `training` -- confirm intent.
ordered.cor <- order(cor(rand.data)[seq_len(n_col + 1), 1], decreasing = TRUE)

for (i in seq_len(n_col)) {
  m1 <- glm(
    rand.class ~ .,
    data = rand.data[training, ordered.cor[seq_len(i + 1)]],
    family = "binomial"
  )

  # Classify as 1 when the fitted probability exceeds 0.5.
  train.pred <- predict(m1, rand.data[training, ], type = "response") > 0.5
  test.pred <- predict(m1, rand.data[test, ], type = "response") > 0.5

  training.acc[i] <- mean(train.pred == rand.data$rand.class[training])
  test.acc[i] <- mean(test.pred == rand.data$rand.class[test])
}

tmp <- data.frame(
  idx = seq_len(n_col),
  training.acc = training.acc,
  test.acc = test.acc
)

# Training accuracy in the default colour, test accuracy in red.
g <- ggplot(data = tmp, aes(idx))
g <- g +
  geom_point(aes(y = training.acc)) +
  geom_smooth(aes(y = training.acc), method = "loess")
g <- g +
  geom_point(aes(y = test.acc), color = "red") +
  geom_smooth(aes(y = test.acc), method = "loess", color = "red")
g <- g + ylab("accuracy")
print(g)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment