# Source: mkowoods/gist:797e8b9bf446fdcd209f

## gistfile1.r
# Simulation: fit logistic regressions with a growing number of predictors
# (added in order of decreasing correlation with the class) and plot the
# training and test accuracy against the number of predictors used.
library(ggplot2)  # ggplot(), geom_point(), geom_smooth() are used below

n_col <- 100
n_rows <- 1000

set.seed(500)

rand.matrix <- matrix(NA_real_, nrow = n_rows, ncol = n_col)
rand.class <- rbinom(n = n_rows, size = 1, prob = 0.7)

# The first five columns depend on the class: an extra noise term is added
# only where rand.class == 1. Every other column is independent noise.
for (i in seq_len(n_col)) {
  predictor <- rnorm(n_rows)
  if (i <= 5) {
    rand.matrix[, i] <- predictor + rand.class * rnorm(n_rows)
  } else {
    rand.matrix[, i] <- predictor
  }
}

rand.data <- as.data.frame(cbind(rand.class, rand.matrix))

# 60/40 split. `test` holds the NEGATED training indices, so
# rand.data[test, ] is exactly the rows that were not used for fitting.
# (Drawing a second independent sample here would make the two sets overlap.)
training <- sample(seq_len(n_rows), 0.6 * n_rows)
test <- -training

train.data <- rand.data[training, ]
test.data <- rand.data[test, ]

training.acc <- rep(NA_real_, n_col)
test.acc <- rep(NA_real_, n_col)

# Column 1 is rand.class itself (correlation 1), so it always sorts first and
# ordered.cor[1:(1 + i)] is the response plus the i top-ranked predictors.
# NOTE(review): the ranking uses signed correlations computed on ALL rows,
# including the test rows -- consider ranking on the training rows only.
ordered.cor <- order(cor(rand.data)[, 1], decreasing = TRUE)

for (i in seq_len(n_col)) {
  m1 <- glm(
    rand.class ~ .,
    data = rand.data[training, ordered.cor[1:(1 + i)]],
    family = "binomial"
  )
  # Classify as 1 when the predicted probability exceeds 0.5; accuracy is the
  # share of rows whose predicted label matches the true class.
  train.hit <- (predict(m1, train.data, type = "response") > 0.5) ==
    train.data$rand.class
  test.hit <- (predict(m1, test.data, type = "response") > 0.5) ==
    test.data$rand.class
  training.acc[i] <- mean(train.hit)
  test.acc[i] <- mean(test.hit)
}

tmp <- data.frame(
  idx = seq_len(n_col),
  training.acc = training.acc,
  test.acc = test.acc
)

# Black points: training accuracy; red points: test accuracy.
g <- ggplot(data = tmp, aes(idx))
g <- g + geom_point(aes(y = training.acc)) +
  geom_smooth(aes(y = training.acc), method = "loess")
g <- g + geom_point(aes(y = test.acc), color = "red") +
  geom_smooth(aes(y = test.acc), method = "loess")
g <- g + ylab("accuracy")
print(g)
	# Simulation: fit logistic regressions with a growing number of predictors
	# (added in order of decreasing correlation with the class) and plot the
	# training and test accuracy against the number of predictors used.
	library(ggplot2)  # ggplot(), geom_point(), geom_smooth() are used below

	n_col <- 100
	n_rows <- 1000

	set.seed(500)

	rand.matrix <- matrix(NA_real_, nrow = n_rows, ncol = n_col)
	rand.class <- rbinom(n = n_rows, size = 1, prob = 0.7)

	# The first five columns depend on the class: an extra noise term is added
	# only where rand.class == 1. Every other column is independent noise.
	for (i in seq_len(n_col)) {
		predictor <- rnorm(n_rows)
		if (i <= 5) {
			rand.matrix[, i] <- predictor + rand.class * rnorm(n_rows)
		} else {
			rand.matrix[, i] <- predictor
		}
	}

	rand.data <- as.data.frame(cbind(rand.class, rand.matrix))

	# 60/40 split. `test` holds the NEGATED training indices, so
	# rand.data[test, ] is exactly the rows that were not used for fitting.
	# (Drawing a second independent sample here would make the two sets overlap.)
	training <- sample(seq_len(n_rows), 0.6 * n_rows)
	test <- -training

	train.data <- rand.data[training, ]
	test.data <- rand.data[test, ]

	training.acc <- rep(NA_real_, n_col)
	test.acc <- rep(NA_real_, n_col)

	# Column 1 is rand.class itself (correlation 1), so it always sorts first and
	# ordered.cor[1:(1 + i)] is the response plus the i top-ranked predictors.
	# NOTE(review): the ranking uses signed correlations computed on ALL rows,
	# including the test rows -- consider ranking on the training rows only.
	ordered.cor <- order(cor(rand.data)[, 1], decreasing = TRUE)

	for (i in seq_len(n_col)) {
		m1 <- glm(
			rand.class ~ .,
			data = rand.data[training, ordered.cor[1:(1 + i)]],
			family = "binomial"
		)
		# Classify as 1 when the predicted probability exceeds 0.5; accuracy is
		# the share of rows whose predicted label matches the true class.
		train.hit <- (predict(m1, train.data, type = "response") > 0.5) ==
			train.data$rand.class
		test.hit <- (predict(m1, test.data, type = "response") > 0.5) ==
			test.data$rand.class
		training.acc[i] <- mean(train.hit)
		test.acc[i] <- mean(test.hit)
	}

	tmp <- data.frame(
		idx = seq_len(n_col),
		training.acc = training.acc,
		test.acc = test.acc
	)

	# Black points: training accuracy; red points: test accuracy.
	g <- ggplot(data = tmp, aes(idx))
	g <- g + geom_point(aes(y = training.acc)) +
		geom_smooth(aes(y = training.acc), method = "loess")
	g <- g + geom_point(aes(y = test.acc), color = "red") +
		geom_smooth(aes(y = test.acc), method = "loess")
	g <- g + ylab("accuracy")
	print(g)