Last active
December 2, 2020 17:01
-
-
Save sanoke/f928d008c4a9fe35c9ba1d7a8a0dbacf to your computer and use it in GitHub Desktop.
code from 'ML with caret in R' course on DataCamp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# # # # # # # # # # # # # # # # # | |
# Sarah Anoke | |
# sarah@sanoke.info | |
# | |
# Machine Learning with caret in R | |
# code from the eponymous DataCamp course | |
# by Zachary Deane-Mayer and Max Kuhn | |
# https://learn.datacamp.com/courses/machine-learning-with-caret-in-r | |
# # # # # # # # # # # # # # # # # | |
library(tidyverse) # 'diamonds' dataset contained in ggplot2 | |
library(mlbench) # Boston housing dataset contained here | |
library(tidytext) # tokenization | |
library(caret) # for cross-validation + parameter-tuning | |
library(caTools) # for computing ROC curves | |
# - - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 1 - - - - - - - - # | |
# - in-sample RMSE for linear regression - # | |
# - - - - - - - - - - - - - - - - - - - - - # | |
# fit a linear regression model | |
model <- lm(price ~ ., data = diamonds) | |
# generate (in-sample) predictions | |
p <- predict(model, newdata = diamonds) | |
# calculate prediction errors | |
error <- p - diamonds$price | |
# calculate RMSE (square root of the average squared error) | |
sqrt(mean(error^2)) # 1130 | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 2 - - - - - - - # | |
# out-of-sample RMSE for linear regression # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# set seed | |
set.seed(42) | |
# shuffle row indices | |
rows <- sample(nrow(diamonds)) | |
# randomly order data | |
shuffled_diamonds <- diamonds[rows,] | |
# determine row to split on, for a 80/20 split | |
split <- round(nrow(diamonds)*0.80) | |
# create training DS using 80% of the full dataset | |
train <- diamonds[1:split,] | |
# create test DS with the remaining 20% | |
test <- diamonds[(split+1):nrow(diamonds),] | |
# fit a linear regr model using the training data | |
model <- lm(price ~ . ,train) | |
# generate predictions using the test data | |
p <- predict(model, test) | |
# calculate prediction errors based on the test data | |
error <- p - test$price | |
# calculate RMSE | |
# - test RMSE is higher than training RMSE because | |
# - model is (over)fit based on the training data | |
# - and the test set contains data the model hasn't seen before | |
sqrt(mean(error^2)) # 797 | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 3 - - - - - - - - # | |
# - - 10-fold cross-validation for LR - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# CV is advantageous b/c it gives multiple estimates of OOS error, | |
# rather than a single one | |
# fit linear regr model using 10-fold CV | |
model <- caret::train( | |
price ~ ., diamonds, | |
method = "lm", | |
trControl = trainControl( | |
method = "cv", number = 10, | |
verboseIter = TRUE | |
) | |
) | |
# print contents | |
model | |
## Linear Regression | |
## 53940 samples | |
## 9 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (10 fold) | |
## Summary of sample sizes: 48545, 48547, 48547, 48547, 48545, 48545, ... | |
## Resampling results: | |
## RMSE Rsquared MAE | |
## 1130.728 0.9196672 740.4929 | |
## Tuning parameter 'intercept' was held constant at a value of TRUE | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 4 - - - - - - - - # | |
# - - 5-fold cross-validation for LR - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# loading dataset | |
data(BostonHousing) | |
# changing the name to match what's used in DataCamp | |
Boston <- BostonHousing | |
# fit linear regr model using 5-fold CV | |
model <- train( | |
medv ~ ., Boston, | |
method = "lm", | |
trControl = trainControl( | |
method = "cv", number = 5, | |
verboseIter = TRUE | |
) | |
) | |
# print contents | |
model | |
## Linear Regression | |
## 506 samples | |
## 13 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (5 fold) | |
## Summary of sample sizes: 405, 405, 404, 406, 404 | |
## Resampling results: | |
## RMSE Rsquared MAE | |
## 4.941179 0.724198 3.429735 | |
## Tuning parameter 'intercept' was held constant at a value of TRUE | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 5 - - - - - - - - # | |
# - 5x5-fold cross-validation for LR - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# an example of repeated cross-validation | |
# 5-fold CV repeated 5 times (5x5) | |
# - takes longer but produces more out-of-sample datasets | |
# - and much more precise assessments of how well the model | |
# - performs | |
# fit linear regr model using 5-fold CV | |
model <- train( | |
medv ~ ., Boston, | |
method = "lm", | |
trControl = trainControl( | |
method = "repeatedcv", number = 5, | |
repeats = 5, verboseIter = TRUE | |
) | |
) | |
# print contents | |
model | |
## Linear Regression | |
## 506 samples | |
## 13 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (5 fold, repeated 5 times) | |
## Summary of sample sizes: 406, 405, 404, 405, 404, 405, ... | |
## Resampling results: | |
## RMSE Rsquared MAE | |
## 4.852366 0.7213942 3.400954 | |
## Tuning parameter 'intercept' was held constant at a value of TRUE | |
# generate predictions using the full Boston dataset | |
predict(model, Boston) | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 6 - - - - - - - - # | |
# - - 60/40 split for logistic regr - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# shuffle row indices | |
n_obs <- nrow(Sonar) | |
permuted_rows <- sample(n_obs) | |
# randomly order data: Sonar | |
Sonar_shuffled <- Sonar[permuted_rows,] | |
# identify row to split on: split | |
split <- round(nrow(Sonar) * 0.60) | |
# create train | |
train <- Sonar_shuffled[1:split,] | |
# create test | |
test <- Sonar_shuffled[(split+1):nrow(Sonar),] | |
# fit logistic regression | |
model <- glm(Class ~ . , family = 'binomial' , train) | |
# Predict on test: p | |
p <- predict(model, test, type = 'response') | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 7 - - - - - - - - # | |
# - - - calculate a confusion matrix- - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# confusion matrices are used to assess classification | |
# - accuracy. in this case, we're doing a binary classification | |
# - between 'M' and 'R' | |
# testing a threshold: 0.5 | |
# - if p exceeds threshold of 0.5, M else R | |
m_or_r <- ifelse(p > 0.5, 'M', 'R') | |
# convert to factor | |
p_class <- factor(m_or_r, levels = levels(test[['Class']])) | |
# create confusion matrix | |
confusionMatrix(p_class, test[['Class']]) | |
# testing a threshold: 0.9 | |
# - good if we want to be very certain about what we classify as 'M' | |
# - (will get fewer predicted 'M'ines but greater confidence in each | |
# prediction) | |
# - if p exceeds threshold of 0.9, M else R | |
m_or_r <- ifelse(p > 0.9, 'M', 'R') | |
# convert to factor | |
p_class <- factor(m_or_r, levels(test[['Class']])) | |
# create confusion matrix | |
confusionMatrix(p_class, test$Class) | |
# testing a threshold: 0.1 | |
# - good if we want to make sure we capture everything that could | |
# be a mine | |
# - if p exceeds threshold of 0.1, M else R | |
m_or_r <- ifelse(p > 0.1, 'M', 'R') | |
# convert to factor | |
p_class <- factor(m_or_r, levels(test[['Class']])) | |
# create confusion matrix | |
confusionMatrix(p_class, test$Class) | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 8 - - - - - - - - # | |
# - - - - - - - ROC curve - - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - an ROC curve evaluates all possible thresholds for splitting predicted | |
# probabilities into predicted classes | |
# - it summarizes the performance of a classifier over all possible thresholds | |
# generate predictions on test data | |
p <- predict(model, test, type="response") | |
plot ROC curve | |
caTools::colAUC(p, test$Class, plotROC=TRUE) | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 9 - - - - - - - - # | |
# - - - - - - - - AUC - - - - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# AUC is a single-number summary of a model's ability to discriminate | |
# - the positive from the negative class | |
# * AUC of 0.5 is no better than random guessing | |
# * AUC of 1.0 is a perfectly-predictive model | |
# * AUC of 0.0 is perfectly anti-predictive (rare) | |
# create trainControl object | |
# - use AUC instead of accuracy, agnostic to model-fitting procedures | |
# - 10-fold CV | |
myControl <- trainControl( | |
method = "cv", | |
number = 10, | |
summaryFunction = twoClassSummary, | |
classProbs = TRUE, | |
verboseIter = TRUE | |
) | |
# train glm with custom trainControl | |
model <- train(Class ~ ., | |
Sonar, | |
method = 'glm', | |
trControl = myControl) | |
# print model to console | |
model | |
## Generalized Linear Model | |
## 208 samples | |
## 60 predictor | |
## 2 classes: 'M', 'R' | |
## No pre-processing | |
## Resampling: Cross-Validated (10 fold) | |
## Summary of sample sizes: 187, 188, 187, 188, 188, 187, ... | |
## Resampling results: | |
## ROC Sens Spec | |
## 0.7520455 0.7659091 0.6755556 | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 10 - - - - - - - - # | |
# - - - - - - random forests - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# random forests are more flexible than linear models | |
# fit a random forest | |
model <- train( | |
quality ~ ., | |
tuneLength = 1, | |
data = wine, | |
method = 'ranger', # ranger is a pkg that rewrites randomForest | |
trControl = trainControl( | |
method = "cv", | |
number = 5, | |
verboseIter = TRUE | |
) | |
) | |
# print model to console | |
model | |
## Random Forest | |
## 100 samples | |
## 12 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (5 fold) | |
## Summary of sample sizes: 79, 80, 80, 80, 81 | |
## Resampling results across tuning parameters: | |
## splitrule RMSE Rsquared MAE | |
## variance 0.6444236 0.3115906 0.4849863 | |
## extratrees 0.6674696 0.2769533 0.4989182 | |
## Tuning parameter 'mtry' was held constant at a value of 3 | |
## Tuning | |
## parameter 'min.node.size' was held constant at a value of 5 | |
## RMSE was used to select the optimal model using the smallest value. | |
## The final values used for the model were mtry = 3, splitrule = variance | |
## and min.node.size = 5. | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 11 - - - - - - - - # | |
# - - - - - - tuning models - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# tuning grid is calibrated using the tuneLength argument. | |
# the larger, the more fine the grid (default is 3). | |
# fit random forest | |
model <- train( | |
quality ~ ., | |
tuneLength = 3, | |
data = wine, | |
method = "ranger", | |
trControl = trainControl( | |
method = "cv", | |
number = 5, | |
verboseIter = TRUE | |
) | |
) | |
# print model to console | |
model | |
## Random Forest | |
## 100 samples | |
## 12 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (5 fold) | |
## Summary of sample sizes: 80, 79, 79, 81, 81 | |
## Resampling results across tuning parameters: | |
## mtry splitrule RMSE Rsquared MAE | |
## 2 variance 0.7830336 0.2515022 0.6161859 | |
## 2 extratrees 0.7750128 0.2799630 0.6058953 | |
## 7 variance 0.7807803 0.2383175 0.6247166 | |
## 7 extratrees 0.7612100 0.2796607 0.5940507 | |
## 12 variance 0.7831554 0.2313966 0.6312649 | |
## 12 extratrees 0.7679918 0.2570055 0.5979680 | |
## Tuning parameter 'min.node.size' was held constant at a value of 5 | |
## RMSE was used to select the optimal model using the smallest value. | |
## The final values used for the model were mtry = 7, splitrule = extratrees | |
## and min.node.size = 5. | |
# plot model | |
plot(model) # 1.png | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 12 - - - - - - - - # | |
# - - - - custom tuning grids - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# more fine-grained control over the tuning parameters | |
# define the tuning grid | |
tuneGrid <- data.frame( | |
.mtry = c(2,3,7), | |
.splitrule = "variance", | |
.min.node.size = 5 | |
) | |
# fit random forest | |
model <- train( | |
quality ~ ., | |
tuneGrid = tuneGrid, | |
data = wine, | |
method = 'ranger', | |
trControl = trainControl( | |
method = "cv", | |
number = 5, | |
verboseIter = TRUE | |
) | |
) | |
# print model to console | |
model | |
## Random Forest | |
## 100 samples | |
## 12 predictor | |
## No pre-processing | |
## Resampling: Cross-Validated (5 fold) | |
## Summary of sample sizes: 79, 80, 80, 80, 81 | |
## Resampling results across tuning parameters: | |
## mtry RMSE Rsquared MAE | |
## 2 0.6525708 0.3008584 0.4888581 | |
## 3 0.6402476 0.3258908 0.4790033 | |
## 7 0.6404442 0.3142528 0.4760855 | |
## Tuning parameter 'splitrule' was held constant at a value of variance | |
## Tuning parameter 'min.node.size' was held constant at a value of 5 | |
## RMSE was used to select the optimal model using the smallest value. | |
## The final values used for the model were mtry = 3, splitrule = variance | |
## and min.node.size = 5. | |
plot(model) # 2.png | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 13 - - - - - - - - # | |
# - - - - - - - - glmnet - - - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# glmnet is better than glm in that it places constraints on | |
# - regression coefficients, which helps prevent overfitting | |
# create custom trainControl | |
myControl <- trainControl( | |
method = "cv", | |
number = 10, | |
summaryFunction = twoClassSummary, | |
classProbs = TRUE, # IMPORTANT! | |
verboseIter = TRUE | |
) | |
# fit glmnet model | |
model <- train( | |
y ~ ., | |
overfit, | |
method = "glmnet", | |
trControl = myControl | |
) | |
# print model to console | |
model | |
## glmnet | |
## 250 samples | |
## 200 predictors | |
## 2 classes: 'class1', 'class2' | |
## No pre-processing | |
## Resampling: Cross-Validated (10 fold) | |
## Summary of sample sizes: 226, 225, 225, 224, 225, 225, ... | |
## Resampling results across tuning parameters: | |
## alpha lambda ROC Sens Spec | |
## 0.10 0.0001012745 0.4345109 0.05 0.9697464 | |
## 0.10 0.0010127448 0.4365942 0.05 0.9740942 | |
## 0.10 0.0101274483 0.4580616 0.05 0.9826087 | |
## 0.55 0.0001012745 0.3893116 0.05 0.9568841 | |
## 0.55 0.0010127448 0.3850543 0.05 0.9697464 | |
## 0.55 0.0101274483 0.4280797 0.05 0.9826087 | |
## 1.00 0.0001012745 0.3662138 0.05 0.9353261 | |
## 1.00 0.0010127448 0.3747283 0.05 0.9481884 | |
## 1.00 0.0101274483 0.4049819 0.00 0.9869565 | |
## ROC was used to select the optimal model using the largest value. | |
## The final values used for the model were alpha = 0.1 and lambda = 0.01012745. | |
# print maximum ROC statistic | |
max(model$results$ROC) | |
## [1] 0.4580616 | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 14 - - - - - - - - # | |
# - - - glmnet with tuning grid - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# the default tuning grid is very small | |
# - and there are many more potential glmnet models we'd | |
# - want to explore | |
# train glmnet with custom trainControl and tuning | |
model <- train( | |
y ~ ., | |
overfit, | |
tuneGrid = expand.grid( | |
alpha = 0:1, # 0 is pure ridge, 1 is pure lasso | |
lambda = seq(0.0001, 1, length = 20) # max lambda is 10 | |
), | |
method = 'glmnet', | |
trControl = myControl | |
) | |
# print model to console | |
model | |
# print maximum ROC statistic | |
max(model$results$ROC) # 3.png | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 15 - - - - - - - - # | |
# - - - - median imputation - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# apply median imputation (pre-processing step) | |
median_model <- train( | |
x = breast_cancer_x, | |
y = breast_cancer_y, | |
method = "glm", | |
trControl = myControl, | |
preProcess = "medianImpute" | |
) | |
# print median_model to console | |
median_model | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 16 - - - - - - - - # | |
# - - - - - - KNN imputation - - - - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# an alternative imputation method to try alongside | |
# - median imputation, then the method that produces | |
# - the most accurate method is chosen (e.g., the one | |
# - with the highest ROC metric) | |
# apply KNN imputation | |
knn_model <- train( | |
x = breast_cancer_x, | |
y = breast_cancer_y, | |
method = 'glm', | |
trControl = myControl, | |
preProcess = 'knnImpute' | |
) | |
# print knn_model to console | |
knn_model | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 17 - - - - - - - - # | |
# - - - multiple pre-processing steps - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# in caret's preProcess(), median imputation comes before | |
# - centering and scaling of vars (i.e., standardization) | |
# fit glm with median imputation | |
model <- train( | |
x = breast_cancer_x, | |
y = breast_cancer_y, | |
method = "glm", | |
trControl = myControl, | |
preProcess = c('medianImpute','center','scale') | |
) | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 18 - - - - - - - - # | |
# - - - removing zero var predictor - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# removing near-zero variance predictors reduces model-fitting | |
# - time without reducing model accuracy. | |
# - also precludes some issues during CV where a predictor takes on | |
# - only one value within that fold | |
# identify near zero variance predictors | |
remove_cols <- nearZeroVar(bloodbrain_x, names = TRUE, | |
freqCut = 2, uniqueCut = 20) | |
# get all column names from bloodbrain_x | |
all_cols = names(bloodbrain_x) | |
# remove from data | |
bloodbrain_x_small <- bloodbrain_x[ , setdiff(all_cols, remove_cols)] | |
# fit model on reduced data | |
model <- train( | |
x = bloodbrain_x_small, | |
y = bloodbrain_y, | |
method = "glm" | |
) | |
# NOTE can also remove near/zero variance predictors using | |
# preProcess = 'nzv' within train() | |
# preProcess = 'zv' within train() | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 19 - - - - - - - - # | |
# - - - PCA as a preprocessing step - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# PCA can be used as a preprocessing step, as an alternative | |
# to removing low-variance predictors. | |
# fit glm model using PCA | |
model <- train( | |
x = bloodbrain_x, | |
y = bloodbrain_y, | |
method = 'glm', | |
preProcess = 'pca' | |
) | |
# - - - - - - - - - - - - - - - - - - - - # | |
# - - - - - - - EXAMPLE 20 - - - - - - - - # | |
# - - - - running a full analysis! - - - - # | |
# - - - - - - - - - - - - - - - - - - - - # | |
# key here is the reuse of a trainControl(), which has several benefits | |
# - can use the same summaryFunction() and tuning parameters for multiple models | |
# - don't have to repeat code when fitting multiple models | |
# - can compare models on the exact same training and test data | |
# create custom indices | |
myFolds <- createFolds(churn_y, k = 5) | |
# create reusable trainControl object | |
myControl <- trainControl( | |
summaryFunction = twoClassSummary, | |
classProbs = TRUE, # IMPORTANT! | |
verboseIter = TRUE, | |
savePredictions = TRUE, | |
index = myFolds | |
) | |
# fit the baseline model w/ glmnet | |
model_glmnet <- train( | |
x = churn_x, | |
y = churn_y, | |
metric = "ROC", | |
method = 'glmnet', | |
trControl = myControl | |
) | |
# fit the baseline model w/ RF | |
# - slower to fit and less interpretable than glmnet | |
# - often more accurate than glmnet | |
# - easier to tune | |
# - require little preprocessing | |
# - capture threshold effects and variable interactions | |
model_rf <- train( | |
x = churn_x, | |
y = churn_y, | |
metric = "ROC", | |
method = 'ranger', | |
trControl = myControl | |
) | |
# compare the glmnet model and RF model using resamples() | |
# - first we have to make sure the models being compared were | |
# - fit on exactly the same training and test sets during CV | |
# - so that we're sure that we're making an apples-to-apples comparison | |
# - of their results. | |
# SELECTION CRITERIA | |
# - highest average AUC | |
# - low standard deviation in AUC | |
# create model_list | |
model_list <- list(item1 = model_glmnet, item2 = model_rf) | |
# pass model_list to resamples() | |
resamples <- resamples(model_list) | |
# summarize the results | |
summary(resamples) | |
# create bwplot (also part of summarization) | |
bwplot(resamples, metric = 'ROC') # 4.png | |
# create xyplot (also part of summarization) | |
xyplot(resamples, metric='ROC') # 5.png |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment