# # # # # # # # # # # # # # # # #
# Sarah Anoke
# sarah@sanoke.info
#
# Machine Learning with caret in R
# code from the eponymous DataCamp course
# by Zachary Deane-Mayer and Max Kuhn
# https://learn.datacamp.com/courses/machine-learning-with-caret-in-r
# # # # # # # # # # # # # # # # #
library(tidyverse) # 'diamonds' dataset contained in ggplot2
library(mlbench) # Boston housing dataset contained here
library(tidytext) # tokenization
library(caret) # for cross-validation + parameter-tuning
library(caTools) # for computing ROC curves
# - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 1 - - - - - - - - #
# - in-sample RMSE for linear regression - #
# - - - - - - - - - - - - - - - - - - - - - #
# fit a linear regression model
model <- lm(price ~ ., data = diamonds)
# generate (in-sample) predictions
p <- predict(model, newdata = diamonds)
# calculate prediction errors
error <- p - diamonds$price
# calculate RMSE (square root of the average squared error)
sqrt(mean(error^2)) # 1130
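# caret also ships an RMSE() helper that computes the same quantity;
# a minimal sketch using the objects defined above
caret::RMSE(pred = p, obs = diamonds$price)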
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 2 - - - - - - - #
# out-of-sample RMSE for linear regression #
# - - - - - - - - - - - - - - - - - - - - #
# set seed
set.seed(42)
# shuffle row indices
rows <- sample(nrow(diamonds))
# randomly order data
shuffled_diamonds <- diamonds[rows, ]
# determine row to split on, for an 80/20 split
split <- round(nrow(diamonds) * 0.80)
# create training set using the first 80% of the shuffled data
train <- shuffled_diamonds[1:split, ]
# create test set with the remaining 20%
test <- shuffled_diamonds[(split + 1):nrow(shuffled_diamonds), ]
# fit a linear regression model using the training data
model <- lm(price ~ ., data = train)
# generate predictions using the test data
p <- predict(model, test)
# calculate prediction errors based on the test data
error <- p - test$price
# calculate RMSE
# - out-of-sample RMSE is typically higher than in-sample RMSE because
# - the model is (over)fit to the training data
# - and the test set contains data the model hasn't seen before
sqrt(mean(error^2)) # value depends on the seed/split
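# caret's postResample() reports RMSE, R^2, and MAE in one call;
# a small sketch using the test-set predictions from above
caret::postResample(pred = p, obs = test$price)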
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 3 - - - - - - - - #
# - - 10-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# cross-validation is advantageous because it gives multiple estimates of
# out-of-sample error, rather than a single one
# fit linear regr model using 10-fold CV
model <- caret::train(
price ~ ., diamonds,
method = "lm",
trControl = trainControl(
method = "cv", number = 10,
verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 53940 samples
## 9 predictor
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 48545, 48547, 48547, 48547, 48545, 48545, ...
## Resampling results:
## RMSE Rsquared MAE
## 1130.728 0.9196672 740.4929
## Tuning parameter 'intercept' was held constant at a value of TRUE
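# the fold-level estimates behind the summary above are stored in
# model$resample; a quick sketch of inspecting their spread
model$resample           # one row of RMSE / Rsquared / MAE per held-out fold
sd(model$resample$RMSE)  # variability of the out-of-sample RMSE across folds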
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 4 - - - - - - - - #
# - - 5-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# loading dataset
data(BostonHousing)
# changing the name to match what's used in DataCamp
Boston <- BostonHousing
# fit linear regr model using 5-fold CV
model <- train(
medv ~ ., Boston,
method = "lm",
trControl = trainControl(
method = "cv", number = 5,
verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 506 samples
## 13 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 405, 404, 406, 404
## Resampling results:
## RMSE Rsquared MAE
## 4.941179 0.724198 3.429735
## Tuning parameter 'intercept' was held constant at a value of TRUE
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 5 - - - - - - - - #
# - 5x5-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# an example of repeated cross-validation
# 5-fold CV repeated 5 times (5x5)
# - takes longer but produces more out-of-sample datasets
# - and a more precise assessment of how well the model
# - performs
# fit linear regr model using 5-fold CV
model <- train(
medv ~ ., Boston,
method = "lm",
trControl = trainControl(
method = "repeatedcv", number = 5,
repeats = 5, verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 506 samples
## 13 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 406, 405, 404, 405, 404, 405, ...
## Resampling results:
## RMSE Rsquared MAE
## 4.852366 0.7213942 3.400954
## Tuning parameter 'intercept' was held constant at a value of TRUE
# generate predictions using the full Boston dataset
predict(model, Boston)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 6 - - - - - - - - #
# - - 60/40 split for logistic regr - - #
# - - - - - - - - - - - - - - - - - - - - #
# load the Sonar dataset (from mlbench)
data(Sonar)
# shuffle row indices
n_obs <- nrow(Sonar)
permuted_rows <- sample(n_obs)
# randomly order data: Sonar
Sonar_shuffled <- Sonar[permuted_rows,]
# identify row to split on: split
split <- round(nrow(Sonar) * 0.60)
# create train
train <- Sonar_shuffled[1:split,]
# create test
test <- Sonar_shuffled[(split+1):nrow(Sonar),]
# fit logistic regression
model <- glm(Class ~ . , family = 'binomial' , train)
# Predict on test: p
p <- predict(model, test, type = 'response')
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 7 - - - - - - - - #
# - - - calculate a confusion matrix- - - #
# - - - - - - - - - - - - - - - - - - - - #
# confusion matrices are used to assess classification
# - accuracy. In this case, we're doing a binary classification
# - between 'M' (mine) and 'R' (rock)
# testing a threshold: 0.5
# - if p exceeds threshold of 0.5, M else R
m_or_r <- ifelse(p > 0.5, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels = levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test[['Class']])
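# the object returned by confusionMatrix() stores its pieces, which is handy
# when you want individual metrics; a brief sketch
cm <- confusionMatrix(p_class, test[['Class']])
cm$table                                     # the 2x2 table of counts
cm$overall['Accuracy']                       # overall accuracy
cm$byClass[c('Sensitivity', 'Specificity')]  # per-class rates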
# testing a threshold: 0.9
# - good if we want to be very certain about what we classify as 'M'
# - (will get fewer predicted 'M'ines but greater confidence in each
# prediction)
# - if p exceeds threshold of 0.9, M else R
m_or_r <- ifelse(p > 0.9, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test$Class)
# testing a threshold: 0.1
# - good if we want to make sure we capture everything that could
# be a mine
# - if p exceeds threshold of 0.1, M else R
m_or_r <- ifelse(p > 0.1, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test$Class)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 8 - - - - - - - - #
# - - - - - - - ROC curve - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# - an ROC curve evaluates all possible thresholds for splitting predicted
# probabilities into predicted classes
# - it summarizes the performance of a classifier over all possible thresholds
# generate predictions on test data
p <- predict(model, test, type="response")
# plot ROC curve
caTools::colAUC(p, test$Class, plotROC=TRUE)
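# colAUC() also returns the AUC value(s) as a matrix, so the result can be
# stored; a small sketch (plotROC = FALSE skips the plot)
auc <- caTools::colAUC(p, test$Class, plotROC = FALSE)
auc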
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 9 - - - - - - - - #
# - - - - - - - - AUC - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# AUC is a single-number summary of a model's ability to discriminate
# - the positive from the negative class
# * AUC of 0.5 is no better than random guessing
# * AUC of 1.0 is a perfectly-predictive model
# * AUC of 0.0 is perfectly anti-predictive (rare)
# create trainControl object
# - use AUC (via twoClassSummary) instead of accuracy; AUC is agnostic
# - to the model-fitting procedure
# - 10-fold CV
myControl <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE,
verboseIter = TRUE
)
# train glm with custom trainControl
model <- train(Class ~ .,
Sonar,
method = 'glm',
trControl = myControl)
# print model to console
model
## Generalized Linear Model
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 187, 188, 187, 188, 188, 187, ...
## Resampling results:
## ROC Sens Spec
## 0.7520455 0.7659091 0.6755556
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 10 - - - - - - - - #
# - - - - - - random forests - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# random forests are more flexible than linear models
# fit a random forest
model <- train(
quality ~ .,
tuneLength = 1,
data = wine,
method = 'ranger', # ranger is a fast reimplementation of random forests
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 80, 80, 81
## Resampling results across tuning parameters:
## splitrule RMSE Rsquared MAE
## variance 0.6444236 0.3115906 0.4849863
## extratrees 0.6674696 0.2769533 0.4989182
## Tuning parameter 'mtry' was held constant at a value of 3
## Tuning
## parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 3, splitrule = variance
## and min.node.size = 5.
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 11 - - - - - - - - #
# - - - - - - tuning models - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# the size of the tuning grid is controlled by the tuneLength argument;
# the larger the value, the finer the grid (default is 3)
# fit random forest
model <- train(
quality ~ .,
tuneLength = 3,
data = wine,
method = "ranger",
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 79, 79, 81, 81
## Resampling results across tuning parameters:
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.7830336 0.2515022 0.6161859
## 2 extratrees 0.7750128 0.2799630 0.6058953
## 7 variance 0.7807803 0.2383175 0.6247166
## 7 extratrees 0.7612100 0.2796607 0.5940507
## 12 variance 0.7831554 0.2313966 0.6312649
## 12 extratrees 0.7679918 0.2570055 0.5979680
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 7, splitrule = extratrees
## and min.node.size = 5.
# plot model
plot(model) # 1.png
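# the winning hyperparameter combination is stored in model$bestTune;
# a quick sketch
model$bestTune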
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 12 - - - - - - - - #
# - - - - custom tuning grids - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# a custom tuneGrid gives more fine-grained control over the tuning parameters
# define the tuning grid
tuneGrid <- data.frame(
.mtry = c(2,3,7),
.splitrule = "variance",
.min.node.size = 5
)
# fit random forest
model <- train(
quality ~ .,
tuneGrid = tuneGrid,
data = wine,
method = 'ranger',
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 80, 80, 81
## Resampling results across tuning parameters:
## mtry RMSE Rsquared MAE
## 2 0.6525708 0.3008584 0.4888581
## 3 0.6402476 0.3258908 0.4790033
## 7 0.6404442 0.3142528 0.4760855
## Tuning parameter 'splitrule' was held constant at a value of variance
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 3, splitrule = variance
## and min.node.size = 5.
plot(model) # 2.png
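# expand.grid() is convenient when you want every combination of several
# candidate values; a sketch with hypothetical values that could be passed
# to train() via tuneGrid =
tuneGrid_full <- expand.grid(
.mtry = c(2, 3, 7),
.splitrule = c('variance', 'extratrees'),
.min.node.size = c(1, 5, 10)
)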
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 13 - - - - - - - - #
# - - - - - - - - glmnet - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# glmnet extends glm by placing penalties (ridge and/or lasso) on the
# - regression coefficients, which helps prevent overfitting
# create custom trainControl
myControl <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# fit glmnet model
model <- train(
y ~ .,
overfit,
method = "glmnet",
trControl = myControl
)
# print model to console
model
## glmnet
## 250 samples
## 200 predictors
## 2 classes: 'class1', 'class2'
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 226, 225, 225, 224, 225, 225, ...
## Resampling results across tuning parameters:
## alpha lambda ROC Sens Spec
## 0.10 0.0001012745 0.4345109 0.05 0.9697464
## 0.10 0.0010127448 0.4365942 0.05 0.9740942
## 0.10 0.0101274483 0.4580616 0.05 0.9826087
## 0.55 0.0001012745 0.3893116 0.05 0.9568841
## 0.55 0.0010127448 0.3850543 0.05 0.9697464
## 0.55 0.0101274483 0.4280797 0.05 0.9826087
## 1.00 0.0001012745 0.3662138 0.05 0.9353261
## 1.00 0.0010127448 0.3747283 0.05 0.9481884
## 1.00 0.0101274483 0.4049819 0.00 0.9869565
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.01012745.
# print maximum ROC statistic
max(model$results$ROC)
## [1] 0.4580616
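# the coefficients kept by the selected penalty can be inspected on the
# underlying glmnet fit; a sketch using the chosen lambda
coef(model$finalModel, s = model$bestTune$lambda)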
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 14 - - - - - - - - #
# - - - glmnet with tuning grid - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# the default tuning grid is very small
# - and there are many more potential glmnet models we'd
# - want to explore
# train glmnet with custom trainControl and tuning
model <- train(
y ~ .,
overfit,
tuneGrid = expand.grid(
alpha = 0:1, # 0 is pure ridge, 1 is pure lasso
lambda = seq(0.0001, 1, length = 20) # 20 values between 0.0001 and 1
),
method = 'glmnet',
trControl = myControl
)
# print model to console
model
# print maximum ROC statistic
max(model$results$ROC) # 3.png
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 15 - - - - - - - - #
# - - - - median imputation - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# apply median imputation (pre-processing step)
median_model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
# print median_model to console
median_model
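# preProcess() can also be run on its own to see what the imputation does;
# a minimal sketch, assuming breast_cancer_x contains missing values
pp <- preProcess(breast_cancer_x, method = 'medianImpute')
imputed_x <- predict(pp, breast_cancer_x)
sum(is.na(imputed_x)) # should be 0 after imputation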
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 16 - - - - - - - - #
# - - - - - - KNN imputation - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# an alternative imputation method to try alongside
# - median imputation; the method that produces the more
# - accurate model is then chosen (e.g., the one with the
# - higher ROC metric)
# apply KNN imputation
knn_model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = 'glm',
trControl = myControl,
preProcess = 'knnImpute'
)
# print knn_model to console
knn_model
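# a quick way to compare the two imputation strategies is to look at their
# resampled ROC values; a sketch using the models fit above
median_model$results$ROC
knn_model$results$ROC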
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 17 - - - - - - - - #
# - - - multiple pre-processing steps - - #
# - - - - - - - - - - - - - - - - - - - - #
# in caret's preProcess(), median imputation comes before
# - centering and scaling of vars (i.e., standardization)
# fit glm with median imputation
model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = c('medianImpute','center','scale')
)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 18 - - - - - - - - #
# - - - removing zero var predictor - - - #
# - - - - - - - - - - - - - - - - - - - - #
# removing near-zero variance predictors reduces model-fitting
# - time without sacrificing much model accuracy.
# - it also precludes some issues during CV where a predictor takes on
# - only one value within a fold
# identify near zero variance predictors
remove_cols <- nearZeroVar(bloodbrain_x, names = TRUE,
freqCut = 2, uniqueCut = 20)
# get all column names from bloodbrain_x
all_cols <- names(bloodbrain_x)
# remove from data
bloodbrain_x_small <- bloodbrain_x[ , setdiff(all_cols, remove_cols)]
# fit model on reduced data
model <- train(
x = bloodbrain_x_small,
y = bloodbrain_y,
method = "glm"
)
# NOTE: these predictors can also be removed within train() using
# preProcess = 'nzv' (near-zero variance) or
# preProcess = 'zv' (zero variance only)
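# nearZeroVar() can also return the diagnostics behind its decisions;
# a brief sketch
nzv_stats <- nearZeroVar(bloodbrain_x, saveMetrics = TRUE)
head(nzv_stats) # freqRatio, percentUnique, zeroVar, and nzv per column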
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 19 - - - - - - - - #
# - - - PCA as a preprocessing step - - - #
# - - - - - - - - - - - - - - - - - - - - #
# PCA can be used as a preprocessing step, as an alternative
# to removing low-variance predictors.
# fit glm model using PCA
model <- train(
x = bloodbrain_x,
y = bloodbrain_y,
method = 'glm',
preProcess = 'pca'
)
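# the share of variance retained by the principal components can be controlled
# via thresh when calling preProcess() directly; a sketch (0.95 is the default)
pp_pca <- preProcess(bloodbrain_x, method = 'pca', thresh = 0.95)
pp_pca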
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 20 - - - - - - - - #
# - - - - running a full analysis! - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# key here is the reuse of a trainControl(), which has several benefits
# - can use the same summaryFunction() and tuning parameters for multiple models
# - don't have to repeat code when fitting multiple models
# - can compare models on the exact same training and test data
# create custom indices
myFolds <- createFolds(churn_y, k = 5)
# create reusable trainControl object
myControl <- trainControl(
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE,
savePredictions = TRUE,
index = myFolds
)
# fit the baseline model w/ glmnet
model_glmnet <- train(
x = churn_x,
y = churn_y,
metric = "ROC",
method = 'glmnet',
trControl = myControl
)
# fit a competing model w/ random forest
# - slower to fit and less interpretable than glmnet
# - often more accurate than glmnet
# - easier to tune
# - require little preprocessing
# - capture threshold effects and variable interactions
model_rf <- train(
x = churn_x,
y = churn_y,
metric = "ROC",
method = 'ranger',
trControl = myControl
)
# compare the glmnet model and RF model using resamples()
# - first we have to make sure the models being compared were
# - fit on exactly the same training and test sets during CV
# - (guaranteed here by the shared index = myFolds), so that the
# - comparison of their results is apples-to-apples
# SELECTION CRITERIA
# - highest average AUC
# - low standard deviation in AUC
# create model_list
model_list <- list(glmnet = model_glmnet, rf = model_rf)
# pass model_list to resamples()
resamples <- resamples(model_list)
# summarize the results
summary(resamples)
# create bwplot (also part of summarization)
bwplot(resamples, metric = 'ROC') # 4.png
# create xyplot (also part of summarization)
xyplot(resamples, metric='ROC') # 5.png
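# a dotplot shows each model's mean AUC with confidence bounds; a small
# sketch as a final comparison
dotplot(resamples, metric = 'ROC')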