kkraoj/cv_wrong.rmd

## cv_wrong.rmd
```{r setup, include=FALSE}
# The workspace is deliberately NOT cleared here. Knitting normally starts
# from a fresh session, and `rm(list = ls())` would delete the caller's
# objects whenever the document is rendered into an existing environment.
knitr::opts_chunk$set(echo = TRUE)
library(caret)   # provides train() and trainControl() used below
library(ggplot2)

```

# Cross validation Example: The wrong way and the right way

```{r make dataset}

set.seed(6)

n.features <- 1e5         # total number of features in the data (genes)
n.features.select <- 1e2  # number of features kept after screening
n.examples <- 50          # number of examples (patients)

# Labels for the occurrence of heart disease are pure noise: every patient is
# assigned 1 or 2 by rounding a uniform draw on [1, 2].
labels <- round(runif(n = n.examples, min = 1, max = 2))

# Features are generated the same way: one row per patient, one column per
# gene. The labels are then attached as the factor column `y`.
data <- data.frame(
  round(
    matrix(
      runif(n = n.examples * n.features, min = 1, max = 2),
      nrow = n.examples,
      ncol = n.features
    )
  )
)
data[["y"]] <- as.factor(labels)

```

```{r subset features}

#' Screen features by their correlation with the outcome.
#'
#' Computes the Pearson correlation between every non-`y` column of `data`
#' and the outcome `y`, then keeps the columns with the largest signed
#' correlations. A non-numeric outcome (e.g. a factor) is scored through its
#' integer level codes.
#'
#' @param data A data frame with numeric feature columns plus an outcome
#'   column named `y`.
#' @param n.features.select Number of top-ranked features to keep. Requests
#'   larger than the number of available features keep all of them.
#' @return A data frame holding the selected feature columns, ordered from
#'   highest to lowest correlation, followed by `y` as a factor that keeps
#'   the original labels.
best.subset <- function(data, n.features.select = 50) {
  if (!"y" %in% names(data)) {
    stop("`data` must contain an outcome column named 'y'", call. = FALSE)
  }
  stopifnot(
    is.numeric(n.features.select),
    length(n.features.select) == 1,
    n.features.select >= 0
  )

  is.feature <- names(data) != "y"
  feature.names <- names(data)[is.feature]
  outcome <- if (is.numeric(data$y)) data$y else as.numeric(as.factor(data$y))

  # One vectorised cor() call replaces the per-column apply(). drop = FALSE
  # keeps a lone feature as a one-column matrix instead of a bare vector.
  feature.matrix <- as.matrix(data[, is.feature, drop = FALSE])
  correlations <- setNames(
    as.vector(cor(feature.matrix, outcome)),
    feature.names
  )

  # Clamp the request and index with seq_len() so that asking for 0 features,
  # or for more than exist, never produces NA or stray column indices.
  n.keep <- min(n.features.select, length(feature.names))
  ranking <- order(correlations, decreasing = TRUE)
  selected.features <- feature.names[ranking[seq_len(n.keep)]]

  selected.data <- data[, c(selected.features, "y"), drop = FALSE]
  # Convert the original outcome (not its numeric codes) so labels survive.
  selected.data$y <- as.factor(data$y)
  selected.data
}

# Screening is run on every example, including those that are later held out
# during cross-validation. This is the deliberate mistake being demonstrated.
selected.data <- best.subset(data, n.features.select)
```

```{r fit model to selected features. WRONG way of doing it}

# Cross-validation set-up: 5 folds over all examples.
folds <- 5
fold.size <- nrow(data) / folds
train_control <- trainControl(method = "cv", number = folds)

# Fit naive Bayes on the pre-screened features and let caret cross-validate it.
model <- train(
  y ~ .,
  data = selected.data,
  method = "naive_bayes",
  trControl = train_control
)
# print(model)
# Reports the accuracy stored in the first row of caret's results table.
# NOTE(review): confirm the first row is the tuning setting you mean to report.
sprintf(
  "Classification accuracy when CV is performed after subset selection= %0.0f %%",
  100 * model$results$Accuracy[1]
)

```
[1] "Classification accuracy when CV is performed after subset selection= 99%"

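This is called cherry-picking the data, and the accuracy reported above is a
completely misleading estimate of model error: the labels are pure noise, so no
honest classifier can do better than roughly 50%. For a true estimate of model
accuracy, the model must not "peek" at the validation set at all - which means
feature selection must be repeated inside each cross-validation fold, using
only that fold's training data, after the validation fold has been set aside.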
	```{r setup, include=FALSE}
	# The workspace is deliberately NOT cleared here. Knitting normally starts
	# from a fresh session, and `rm(list = ls())` would delete the caller's
	# objects whenever the document is rendered into an existing environment.
	knitr::opts_chunk$set(echo = TRUE)
	library(caret)   # provides train() and trainControl() used below
	library(ggplot2)

	```

	# Cross validation Example: The wrong way and the right way

	```{r make dataset}

	set.seed(6)

	n.features <- 1e5         # total number of features in the data (genes)
	n.features.select <- 1e2  # number of features kept after screening
	n.examples <- 50          # number of examples (patients)

	# Labels for the occurrence of heart disease are pure noise: every patient is
	# assigned 1 or 2 by rounding a uniform draw on [1, 2].
	labels <- round(runif(n = n.examples, min = 1, max = 2))

	# Features are generated the same way: one row per patient, one column per
	# gene. The labels are then attached as the factor column `y`.
	data <- data.frame(
	  round(
	    matrix(
	      runif(n = n.examples * n.features, min = 1, max = 2),
	      nrow = n.examples,
	      ncol = n.features
	    )
	  )
	)
	data[["y"]] <- as.factor(labels)

	```

	```{r subset features}

	#' Screen features by their correlation with the outcome.
	#'
	#' Computes the Pearson correlation between every non-`y` column of `data`
	#' and the outcome `y`, then keeps the columns with the largest signed
	#' correlations. A non-numeric outcome (e.g. a factor) is scored through its
	#' integer level codes.
	#'
	#' @param data A data frame with numeric feature columns plus an outcome
	#'   column named `y`.
	#' @param n.features.select Number of top-ranked features to keep. Requests
	#'   larger than the number of available features keep all of them.
	#' @return A data frame holding the selected feature columns, ordered from
	#'   highest to lowest correlation, followed by `y` as a factor that keeps
	#'   the original labels.
	best.subset <- function(data, n.features.select = 50) {
	  if (!"y" %in% names(data)) {
	    stop("`data` must contain an outcome column named 'y'", call. = FALSE)
	  }
	  stopifnot(
	    is.numeric(n.features.select),
	    length(n.features.select) == 1,
	    n.features.select >= 0
	  )

	  is.feature <- names(data) != "y"
	  feature.names <- names(data)[is.feature]
	  outcome <- if (is.numeric(data$y)) data$y else as.numeric(as.factor(data$y))

	  # One vectorised cor() call replaces the per-column apply(). drop = FALSE
	  # keeps a lone feature as a one-column matrix instead of a bare vector.
	  feature.matrix <- as.matrix(data[, is.feature, drop = FALSE])
	  correlations <- setNames(
	    as.vector(cor(feature.matrix, outcome)),
	    feature.names
	  )

	  # Clamp the request and index with seq_len() so that asking for 0 features,
	  # or for more than exist, never produces NA or stray column indices.
	  n.keep <- min(n.features.select, length(feature.names))
	  ranking <- order(correlations, decreasing = TRUE)
	  selected.features <- feature.names[ranking[seq_len(n.keep)]]

	  selected.data <- data[, c(selected.features, "y"), drop = FALSE]
	  # Convert the original outcome (not its numeric codes) so labels survive.
	  selected.data$y <- as.factor(data$y)
	  selected.data
	}

	# Screening is run on every example, including those that are later held out
	# during cross-validation. This is the deliberate mistake being demonstrated.
	selected.data <- best.subset(data, n.features.select)
	```

	```{r fit model to selected features. WRONG way of doing it}

	# Cross-validation set-up: 5 folds over all examples.
	folds <- 5
	fold.size <- nrow(data) / folds
	train_control <- trainControl(method = "cv", number = folds)

	# Fit naive Bayes on the pre-screened features and let caret cross-validate it.
	model <- train(
	  y ~ .,
	  data = selected.data,
	  method = "naive_bayes",
	  trControl = train_control
	)
	# print(model)
	# Reports the accuracy stored in the first row of caret's results table.
	# NOTE(review): confirm the first row is the tuning setting you mean to report.
	sprintf(
	  "Classification accuracy when CV is performed after subset selection= %0.0f %%",
	  100 * model$results$Accuracy[1]
	)

	```
	[1] "Classification accuracy when CV is performed after subset selection= 99%"

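	This is called cherry-picking the data, and the accuracy reported above is a
	completely misleading estimate of model error: the labels are pure noise, so no
	honest classifier can do better than roughly 50%. For a true estimate of model
	accuracy, the model must not "peek" at the validation set at all - which means
	feature selection must be repeated inside each cross-validation fold, using
	only that fold's training data, after the validation fold has been set aside.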