Sloppy implementation of a simulation exemplifying a common error in performing cross-validation (from The Elements of Statistical Learning, 7.10.2)
# Example of a common cross-validation mistake
# Described in The Elements of Statistical Learning, 7.10.2
# http://statweb.stanford.edu/~tibs/ElemStatLearn/
#
# The (incorrect) recipe described there, paraphrasing: (1) screen the
# predictors, keeping those most correlated with the class labels using ALL of
# the data; (2) build a classifier using just this subset; (3) use
# cross-validation to estimate the prediction error of the final model.
#
# Consider a scenario with
# N = 50 samples in two equal-sized classes, and p = 5000 quantitative
# predictors (standard Gaussian) that are independent of the class labels.
# The true (test) error rate of any classifier is 50%. We carried out the above
# recipe, choosing in step (1) the 100 predictors having highest correlation
# with the class labels, and then using a 1-nearest neighbor classifier, based
# on just these 100 predictors, in step (2). Over 50 simulations from this
# setting, the average CV error rate was 3%. This is far lower than the true
# error rate of 50%.
# What has happened? The problem is that the predictors have an unfair
# advantage, as they were chosen in step (1) on the basis of all of the samples.
# Leaving samples out after the variables have been selected does not correctly
# mimic the application of the classifier to a completely independent
# test set, since these predictors “have already seen” the left out samples.
# Figure 7.10 (top panel) illustrates the problem. We selected the 100 predictors
# having largest correlation with the class labels over all 50 samples.
# Then we chose a random set of 10 samples, as we would do in five-fold cross-validation,
# and computed the correlations of the pre-selected 100 predictors
# with the class labels over just these 10 samples (top panel). We see that
# the correlations average about 0.28, rather than 0, as one might expect.
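#
# A quick check of that claim (not part of the original gist; the helper name
# check.selection.bias is just for illustration): select the 100 highest-correlation
# predictors using all 50 samples, then recompute their correlations on a random
# 10-sample subset. The mean should land well above 0, somewhere near the 0.28
# the book reports.
check.selection.bias <- function(){
  y <- sample(rep(0:1, each = 25))            # 25 samples per class, random order
  X <- matrix(rnorm(50 * 5000), nrow = 50)    # predictors independent of y
  full.corr <- apply(X, 2, function(x) cor(x, y))
  top <- order(full.corr, decreasing = TRUE)[1:100]  # selected on ALL 50 samples
  sub <- sample(1:50, 10)                     # a random 10-sample subset
  sub.corr <- apply(X[sub, top], 2, function(x) cor(x, y[sub]))
  mean(sub.corr)
}
# e.g. set.seed(1); check.selection.bias()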
simulation <- function(){
  library(class)  # For the knn implementation

  # Initialize data.frame with the outcome class (25 samples per class, in random order)
  df <- data.frame(class = c(rep(0, 25), rep(1, 25))[sample(1:50, 50)])

  # Generate 5000 standard Gaussian predictors, independent of the class labels
  num.pred <- 5000
  for(i in 1:num.pred){
    df[, i + 1] <- rnorm(50)
  }

  # Find the 100 predictors most correlated with the class labels,
  # using ALL 50 samples (this is where the selection bias enters)
  corr <- apply(df[, -1], 2, function(x) cor(x, df[, 1]))
  hcp <- names(sort(corr, decreasing = TRUE))[1:100]  # 100 highest-correlated predictors

  # Subset the data to just the selected predictors
  new.df <- df[, c("class", hcp)]

  # Assign each of the 50 samples to one of k = 5 folds
  rows <- nrow(new.df)
  k <- 5
  folds <- sample(1:rows, rows) %% k + 1
  err <- rep(0, k)
  for(i in 1:k){
    # NOTE: fold i (10 samples) is used as the training set and the remaining
    # 40 samples as the test set, i.e. the reverse of the usual k-fold
    # convention of holding fold i out for testing
    in.train <- which(folds == i)
    train <- new.df[in.train, ]
    test <- new.df[-in.train, ]
    cl <- factor(new.df$class[in.train])  # training-set class labels
    nn <- knn(train[, -1], test[, -1], cl, k = 1)  # 1-nearest-neighbour predictions
    err[i] <- length(which(nn != test$class)) / dim(test)[1]  # misclassification rate
  }
  cv.error <- mean(err)
  cv.error
}
# Run the simulation 50 times, recording the CV error estimate from each run
set.seed(42)
sims <- 50
sim.error <- rep(0, sims)
for(j in 1:sims){
  sim.error[j] <- simulation()
}
# sim.error
# [1] 0.125 0.130 0.085 0.045 0.075 0.185 0.090 0.080 0.060 0.075 0.085 0.090
# [13] 0.115 0.060 0.100 0.140 0.120 0.080 0.040 0.140 0.125 0.105 0.045 0.060
# [25] 0.120 0.080 0.035 0.060 0.080 0.075 0.060 0.090 0.095 0.075 0.045 0.130
# [37] 0.080 0.100 0.110 0.055 0.140 0.065 0.060 0.165 0.060 0.060 0.055 0.090
# [49] 0.050 0.045
# mean(sim.error)
# 0.0867
# Getting 0.0867 as the average error, whereas the book reports 3%. The gap is
# likely down to the fold handling noted above: each 10-sample fold is used for
# training and the other 40 samples for testing, the reverse of standard 5-fold
# CV, which leaves the 1-NN classifier with far less training data. Either way,
# the CV estimate is far below the true 50% error rate, which is the point of
# the example.
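#
# For contrast, a minimal sketch of the correct recipe (not part of the
# original gist; the name correct.simulation is just for illustration):
# predictor screening is redone inside each fold using only that fold's
# training samples, and fold i itself is held out for testing. Under this
# scheme the CV error estimate should hover around the true 50% rate.
correct.simulation <- function(){
  library(class)
  y <- sample(rep(0:1, each = 25))
  X <- matrix(rnorm(50 * 5000), nrow = 50)
  k <- 5
  folds <- sample(1:50, 50) %% k + 1
  err <- rep(0, k)
  for(i in 1:k){
    test.idx <- which(folds == i)       # hold fold i out for testing
    train.idx <- setdiff(1:50, test.idx)
    # Screen predictors using ONLY the training samples of this fold
    train.corr <- apply(X[train.idx, ], 2, function(x) cor(x, y[train.idx]))
    top <- order(train.corr, decreasing = TRUE)[1:100]
    nn <- knn(X[train.idx, top], X[test.idx, top], factor(y[train.idx]), k = 1)
    err[i] <- mean(nn != y[test.idx])
  }
  mean(err)
}
# e.g. set.seed(42); correct.simulation()  # expect something near 0.5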