# # # # # # # # # # # # # # # # #
# Sarah Anoke
# sarah@sanoke.info
#
# Machine Learning with caret in R
# code from the eponymous DataCamp course
# by Zachary Deane-Mayer and Max Kuhn
# https://learn.datacamp.com/courses/machine-learning-with-caret-in-r
# # # # # # # # # # # # # # # # #
library(tidyverse) # 'diamonds' dataset contained in ggplot2
library(mlbench) # Boston housing dataset contained here
library(tidytext) # tokenization
library(caret) # for cross-validation + parameter-tuning
library(caTools) # for computing ROC curves
# - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 1 - - - - - - - - #
# - in-sample RMSE for linear regression - #
# - - - - - - - - - - - - - - - - - - - - - #
# fit a linear regression model
model <- lm(price ~ ., data = diamonds)
# generate (in-sample) predictions
p <- predict(model, newdata = diamonds)
# calculate prediction errors
error <- p - diamonds$price
# calculate RMSE (square root of the average squared error)
sqrt(mean(error^2)) # 1130
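# caret also ships an RMSE() helper that computes the same quantity;
# a minimal sketch using the objects defined above
caret::RMSE(pred = p, obs = diamonds$price)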
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 2 - - - - - - - #
# out-of-sample RMSE for linear regression #
# - - - - - - - - - - - - - - - - - - - - #
# set seed
set.seed(42)
# shuffle row indices
rows <- sample(nrow(diamonds))
# randomly order data
shuffled_diamonds <- diamonds[rows, ]
# determine row to split on, for an 80/20 split
split <- round(nrow(diamonds) * 0.80)
# create training set using the first 80% of the shuffled data
train <- shuffled_diamonds[1:split, ]
# create test set with the remaining 20%
test <- shuffled_diamonds[(split + 1):nrow(shuffled_diamonds), ]
# fit a linear regression model using the training data
model <- lm(price ~ ., data = train)
# generate predictions using the test data
p <- predict(model, test)
# calculate prediction errors based on the test data
error <- p - test$price
# calculate RMSE
# - out-of-sample RMSE is typically higher than in-sample RMSE because
# - the model is (over)fit to the training data
# - and the test set contains data the model hasn't seen before
sqrt(mean(error^2)) # value depends on the seed/split
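# caret's postResample() reports RMSE, R^2, and MAE in one call;
# a small sketch using the test-set predictions from above
caret::postResample(pred = p, obs = test$price)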
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 3 - - - - - - - - #
# - - 10-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# cross-validation is advantageous because it gives multiple estimates of
# out-of-sample error, rather than a single one
# fit linear regr model using 10-fold CV
model <- caret::train(
price ~ ., diamonds,
method = "lm",
trControl = trainControl(
method = "cv", number = 10,
verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 53940 samples
## 9 predictor
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 48545, 48547, 48547, 48547, 48545, 48545, ...
## Resampling results:
## RMSE Rsquared MAE
## 1130.728 0.9196672 740.4929
## Tuning parameter 'intercept' was held constant at a value of TRUE
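# the fold-level estimates behind the summary above are stored in
# model$resample; a quick sketch of inspecting their spread
model$resample           # one row of RMSE / Rsquared / MAE per held-out fold
sd(model$resample$RMSE)  # variability of the out-of-sample RMSE across folds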
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 4 - - - - - - - - #
# - - 5-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# loading dataset
data(BostonHousing)
# changing the name to match what's used in DataCamp
Boston <- BostonHousing
# fit linear regr model using 5-fold CV
model <- train(
medv ~ ., Boston,
method = "lm",
trControl = trainControl(
method = "cv", number = 5,
verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 506 samples
## 13 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 405, 404, 406, 404
## Resampling results:
## RMSE Rsquared MAE
## 4.941179 0.724198 3.429735
## Tuning parameter 'intercept' was held constant at a value of TRUE
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 5 - - - - - - - - #
# - 5x5-fold cross-validation for LR - - #
# - - - - - - - - - - - - - - - - - - - - #
# an example of repeated cross-validation
# 5-fold CV repeated 5 times (5x5)
# - takes longer but produces more out-of-sample datasets
# - and a more precise assessment of how well the model
# - performs
# fit linear regr model using 5-fold CV
model <- train(
medv ~ ., Boston,
method = "lm",
trControl = trainControl(
method = "repeatedcv", number = 5,
repeats = 5, verboseIter = TRUE
)
)
# print contents
model
## Linear Regression
## 506 samples
## 13 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 406, 405, 404, 405, 404, 405, ...
## Resampling results:
## RMSE Rsquared MAE
## 4.852366 0.7213942 3.400954
## Tuning parameter 'intercept' was held constant at a value of TRUE
# generate predictions using the full Boston dataset
predict(model, Boston)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 6 - - - - - - - - #
# - - 60/40 split for logistic regr - - #
# - - - - - - - - - - - - - - - - - - - - #
# load the Sonar dataset (from mlbench)
data(Sonar)
# shuffle row indices
n_obs <- nrow(Sonar)
permuted_rows <- sample(n_obs)
# randomly order data: Sonar
Sonar_shuffled <- Sonar[permuted_rows,]
# identify row to split on: split
split <- round(nrow(Sonar) * 0.60)
# create train
train <- Sonar_shuffled[1:split,]
# create test
test <- Sonar_shuffled[(split+1):nrow(Sonar),]
# fit logistic regression
model <- glm(Class ~ . , family = 'binomial' , train)
# Predict on test: p
p <- predict(model, test, type = 'response')
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 7 - - - - - - - - #
# - - - calculate a confusion matrix- - - #
# - - - - - - - - - - - - - - - - - - - - #
# confusion matrices are used to assess classification
# - accuracy. In this case, we're doing a binary classification
# - between 'M' (mine) and 'R' (rock)
# testing a threshold: 0.5
# - if p exceeds threshold of 0.5, M else R
m_or_r <- ifelse(p > 0.5, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels = levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test[['Class']])
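# the object returned by confusionMatrix() stores its pieces, which is handy
# when you want individual metrics; a brief sketch
cm <- confusionMatrix(p_class, test[['Class']])
cm$table                                     # the 2x2 table of counts
cm$overall['Accuracy']                       # overall accuracy
cm$byClass[c('Sensitivity', 'Specificity')]  # per-class rates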
# testing a threshold: 0.9
# - good if we want to be very certain about what we classify as 'M'
# - (will get fewer predicted 'M'ines but greater confidence in each
# prediction)
# - if p exceeds threshold of 0.9, M else R
m_or_r <- ifelse(p > 0.9, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test$Class)
# testing a threshold: 0.1
# - good if we want to make sure we capture everything that could
# be a mine
# - if p exceeds threshold of 0.1, M else R
m_or_r <- ifelse(p > 0.1, 'M', 'R')
# convert to factor
p_class <- factor(m_or_r, levels(test[['Class']]))
# create confusion matrix
confusionMatrix(p_class, test$Class)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 8 - - - - - - - - #
# - - - - - - - ROC curve - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# - an ROC curve evaluates all possible thresholds for splitting predicted
# probabilities into predicted classes
# - it summarizes the performance of a classifier over all possible thresholds
# generate predictions on test data
p <- predict(model, test, type="response")
# plot ROC curve
caTools::colAUC(p, test$Class, plotROC=TRUE)
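# colAUC() also returns the AUC value(s) as a matrix, so the result can be
# stored; a small sketch (plotROC = FALSE skips the plot)
auc <- caTools::colAUC(p, test$Class, plotROC = FALSE)
auc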
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 9 - - - - - - - - #
# - - - - - - - - AUC - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# AUC is a single-number summary of a model's ability to discriminate
# - the positive from the negative class
# * AUC of 0.5 is no better than random guessing
# * AUC of 1.0 is a perfectly-predictive model
# * AUC of 0.0 is perfectly anti-predictive (rare)
# create trainControl object
# - use AUC (via twoClassSummary) instead of accuracy; AUC is agnostic
# - to the model-fitting procedure
# - 10-fold CV
myControl <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE,
verboseIter = TRUE
)
# train glm with custom trainControl
model <- train(Class ~ .,
Sonar,
method = 'glm',
trControl = myControl)
# print model to console
model
## Generalized Linear Model
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 187, 188, 187, 188, 188, 187, ...
## Resampling results:
## ROC Sens Spec
## 0.7520455 0.7659091 0.6755556
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 10 - - - - - - - - #
# - - - - - - random forests - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# random forests are more flexible than linear models
# fit a random forest
model <- train(
quality ~ .,
tuneLength = 1,
data = wine,
method = 'ranger', # ranger is a fast reimplementation of random forests
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 80, 80, 81
## Resampling results across tuning parameters:
## splitrule RMSE Rsquared MAE
## variance 0.6444236 0.3115906 0.4849863
## extratrees 0.6674696 0.2769533 0.4989182
## Tuning parameter 'mtry' was held constant at a value of 3
## Tuning
## parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 3, splitrule = variance
## and min.node.size = 5.
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 11 - - - - - - - - #
# - - - - - - tuning models - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# the size of the tuning grid is controlled by the tuneLength argument;
# the larger the value, the finer the grid (default is 3)
# fit random forest
model <- train(
quality ~ .,
tuneLength = 3,
data = wine,
method = "ranger",
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 79, 79, 81, 81
## Resampling results across tuning parameters:
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.7830336 0.2515022 0.6161859
## 2 extratrees 0.7750128 0.2799630 0.6058953
## 7 variance 0.7807803 0.2383175 0.6247166
## 7 extratrees 0.7612100 0.2796607 0.5940507
## 12 variance 0.7831554 0.2313966 0.6312649
## 12 extratrees 0.7679918 0.2570055 0.5979680
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 7, splitrule = extratrees
## and min.node.size = 5.
# plot model
plot(model) # 1.png
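# the winning hyperparameter combination is stored in model$bestTune;
# a quick sketch
model$bestTune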
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 12 - - - - - - - - #
# - - - - custom tuning grids - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# a custom tuneGrid gives more fine-grained control over the tuning parameters
# define the tuning grid
tuneGrid <- data.frame(
.mtry = c(2,3,7),
.splitrule = "variance",
.min.node.size = 5
)
# fit random forest
model <- train(
quality ~ .,
tuneGrid = tuneGrid,
data = wine,
method = 'ranger',
trControl = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
)
# print model to console
model
## Random Forest
## 100 samples
## 12 predictor
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 80, 80, 81
## Resampling results across tuning parameters:
## mtry RMSE Rsquared MAE
## 2 0.6525708 0.3008584 0.4888581
## 3 0.6402476 0.3258908 0.4790033
## 7 0.6404442 0.3142528 0.4760855
## Tuning parameter 'splitrule' was held constant at a value of variance
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 3, splitrule = variance
## and min.node.size = 5.
plot(model) # 2.png
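# expand.grid() is convenient when you want every combination of several
# candidate values; a sketch with hypothetical values that could be passed
# to train() via tuneGrid =
tuneGrid_full <- expand.grid(
.mtry = c(2, 3, 7),
.splitrule = c('variance', 'extratrees'),
.min.node.size = c(1, 5, 10)
)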
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 13 - - - - - - - - #
# - - - - - - - - glmnet - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# glmnet extends glm by placing penalties (ridge and/or lasso) on the
# - regression coefficients, which helps prevent overfitting
# create custom trainControl
myControl <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# fit glmnet model
model <- train(
y ~ .,
overfit,
method = "glmnet",
trControl = myControl
)
# print model to console
model
## glmnet
## 250 samples
## 200 predictors
## 2 classes: 'class1', 'class2'
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 226, 225, 225, 224, 225, 225, ...
## Resampling results across tuning parameters:
## alpha lambda ROC Sens Spec
## 0.10 0.0001012745 0.4345109 0.05 0.9697464
## 0.10 0.0010127448 0.4365942 0.05 0.9740942
## 0.10 0.0101274483 0.4580616 0.05 0.9826087
## 0.55 0.0001012745 0.3893116 0.05 0.9568841
## 0.55 0.0010127448 0.3850543 0.05 0.9697464
## 0.55 0.0101274483 0.4280797 0.05 0.9826087
## 1.00 0.0001012745 0.3662138 0.05 0.9353261
## 1.00 0.0010127448 0.3747283 0.05 0.9481884
## 1.00 0.0101274483 0.4049819 0.00 0.9869565
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.01012745.
# print maximum ROC statistic
max(model$results$ROC)
## [1] 0.4580616
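# the coefficients kept by the selected penalty can be inspected on the
# underlying glmnet fit; a sketch using the chosen lambda
coef(model$finalModel, s = model$bestTune$lambda)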
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 14 - - - - - - - - #
# - - - glmnet with tuning grid - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# the default tuning grid is very small
# - and there are many more potential glmnet models we'd
# - want to explore
# train glmnet with custom trainControl and tuning
model <- train(
y ~ .,
overfit,
tuneGrid = expand.grid(
alpha = 0:1, # 0 is pure ridge, 1 is pure lasso
lambda = seq(0.0001, 1, length = 20) # 20 values between 0.0001 and 1
),
method = 'glmnet',
trControl = myControl
)
# print model to console
model
# print maximum ROC statistic
max(model$results$ROC) # 3.png
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 15 - - - - - - - - #
# - - - - median imputation - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# apply median imputation (pre-processing step)
median_model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
# print median_model to console
median_model
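# preProcess() can also be run on its own to see what the imputation does;
# a minimal sketch, assuming breast_cancer_x contains missing values
pp <- preProcess(breast_cancer_x, method = 'medianImpute')
imputed_x <- predict(pp, breast_cancer_x)
sum(is.na(imputed_x)) # should be 0 after imputation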
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 16 - - - - - - - - #
# - - - - - - KNN imputation - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# an alternative imputation method to try alongside
# - median imputation; the method that produces the more
# - accurate model is then chosen (e.g., the one with the
# - higher ROC metric)
# apply KNN imputation
knn_model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = 'glm',
trControl = myControl,
preProcess = 'knnImpute'
)
# print knn_model to console
knn_model
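# a quick way to compare the two imputation strategies is to look at their
# resampled ROC values; a sketch using the models fit above
median_model$results$ROC
knn_model$results$ROC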
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 17 - - - - - - - - #
# - - - multiple pre-processing steps - - #
# - - - - - - - - - - - - - - - - - - - - #
# in caret's preProcess(), median imputation comes before
# - centering and scaling of vars (i.e., standardization)
# fit glm with median imputation
model <- train(
x = breast_cancer_x,
y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = c('medianImpute','center','scale')
)
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 18 - - - - - - - - #
# - - - removing zero var predictor - - - #
# - - - - - - - - - - - - - - - - - - - - #
# removing near-zero variance predictors reduces model-fitting
# - time without sacrificing much model accuracy.
# - it also precludes some issues during CV where a predictor takes on
# - only one value within a fold
# identify near zero variance predictors
remove_cols <- nearZeroVar(bloodbrain_x, names = TRUE,
freqCut = 2, uniqueCut = 20)
# get all column names from bloodbrain_x
all_cols <- names(bloodbrain_x)
# remove from data
bloodbrain_x_small <- bloodbrain_x[ , setdiff(all_cols, remove_cols)]
# fit model on reduced data
model <- train(
x = bloodbrain_x_small,
y = bloodbrain_y,
method = "glm"
)
# NOTE: these predictors can also be removed within train() using
# preProcess = 'nzv' (near-zero variance) or
# preProcess = 'zv' (zero variance only)
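# nearZeroVar() can also return the diagnostics behind its decisions;
# a brief sketch
nzv_stats <- nearZeroVar(bloodbrain_x, saveMetrics = TRUE)
head(nzv_stats) # freqRatio, percentUnique, zeroVar, and nzv per column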
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 19 - - - - - - - - #
# - - - PCA as a preprocessing step - - - #
# - - - - - - - - - - - - - - - - - - - - #
# PCA can be used as a preprocessing step, as an alternative
# to removing low-variance predictors.
# fit glm model using PCA
model <- train(
x = bloodbrain_x,
y = bloodbrain_y,
method = 'glm',
preProcess = 'pca'
)
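# the share of variance retained by the principal components can be controlled
# via thresh when calling preProcess() directly; a sketch (0.95 is the default)
pp_pca <- preProcess(bloodbrain_x, method = 'pca', thresh = 0.95)
pp_pca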
# - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - EXAMPLE 20 - - - - - - - - #
# - - - - running a full analysis! - - - - #
# - - - - - - - - - - - - - - - - - - - - #
# key here is the reuse of a trainControl(), which has several benefits
# - can use the same summaryFunction() and tuning parameters for multiple models
# - don't have to repeat code when fitting multiple models
# - can compare models on the exact same training and test data
# create custom indices
myFolds <- createFolds(churn_y, k = 5)
# create reusable trainControl object
myControl <- trainControl(
summaryFunction = twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE,
savePredictions = TRUE,
index = myFolds
)
# fit the baseline model w/ glmnet
model_glmnet <- train(
x = churn_x,
y = churn_y,
metric = "ROC",
method = 'glmnet',
trControl = myControl
)
# fit a competing model w/ random forest
# - slower to fit and less interpretable than glmnet
# - often more accurate than glmnet
# - easier to tune
# - require little preprocessing
# - capture threshold effects and variable interactions
model_rf <- train(
x = churn_x,
y = churn_y,
metric = "ROC",
method = 'ranger',
trControl = myControl
)
# compare the glmnet model and RF model using resamples()
# - first we have to make sure the models being compared were
# - fit on exactly the same training and test sets during CV
# - (guaranteed here by the shared index = myFolds), so that the
# - comparison of their results is apples-to-apples
# SELECTION CRITERIA
# - highest average AUC
# - low standard deviation in AUC
# create model_list
model_list <- list(glmnet = model_glmnet, rf = model_rf)
# pass model_list to resamples()
resamples <- resamples(model_list)
# summarize the results
summary(resamples)
# create bwplot (also part of summarization)
bwplot(resamples, metric = 'ROC') # 4.png
# create xyplot (also part of summarization)
xyplot(resamples, metric='ROC') # 5.png
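# a dotplot shows each model's mean AUC with confidence bounds; a small
# sketch as a final comparison
dotplot(resamples, metric = 'ROC')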