Created
May 30, 2021 18:58
-
-
Save RobWiederstein/8def57b19749f3ee28d926d37407f220 to your computer and use it in GitHub Desktop.
third tutorial from tidymodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################################
##                         tidymodels                          ##
##              3 Evaluate Model with Resampling               ##
##      url: https://www.tidymodels.org/start/resampling/      ##
#################################################################
# 1.0 INTRODUCTION ----
# Resampling measures how well a model predicts new data.
# Goal of this script: predict image segmentation quality.
## 1.1 Load tidymodels ----
library(tidymodels) # for the rsample package, along with the rest of tidymodels
## 1.2 Load helper packages ----
library(modeldata) # for the cells data
# 2.0 THE CELL IMAGE DATA ----
## 2.1 Load data ----
data(cells, package = "modeldata")
## 2.2 Outcome variable is 'class' ----
# PS = "poorly segmented", WS = "well segmented"
# Count the rows in each class and add each class's share of all rows, to
# show how imbalanced the outcome is.
cells %>%
  count(class) %>%
  mutate(prop = n / sum(n))
# 3.0 DATA SPLITTING ----
# The function rsample::initial_split() takes the original data and saves
# the information on how to make the partitions. In the original analysis,
# the authors made their own training/test set and that information is
# contained in the column "case". To demonstrate how to make a split, we
# remove this column before we make our own split.
set.seed(123) # make the random partition reproducible
cell_split <- rsample::initial_split(
  cells %>% select(-case),
  strata = class
)
# Here we used the strata argument, which conducts a stratified split. This
# ensures that, despite the imbalance we noticed in our class variable, our
# training and test data sets will keep roughly the same proportions of poorly
# and well-segmented cells as in the original data. After the initial_split,
# the training() and testing() functions return the actual data sets.
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
# 4.0 CREATE MODEL ----
# One of the benefits of a random forest model is that it is very low
# maintenance; it requires very little preprocessing of the data and the
# default parameters tend to give reasonable results. For that reason, we
# won't create a recipe for the cells data.
# Model specification only (nothing is fit here): a 1000-tree random forest
# classifier using the "ranger" engine.
rf_mod <-
  rand_forest(trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("classification")
## 4.1 Fit model ----
# rf_fit is the fitted model, trained on the training data set with every
# remaining column used as a predictor of class.
set.seed(234) # seed set before fitting so the fit is reproducible
rf_fit <-
  rf_mod %>%
  fit(class ~ ., data = cell_train)
# 5.0 ESTIMATING PERFORMANCE ----
# Performance can be measured by the overall classification accuracy and by
# the area under the Receiver Operating Characteristic (ROC) curve.
# The yardstick package has functions for computing both of these measures,
# called accuracy() and roc_auc(). Don't compute them by re-predicting the
# training set the model was fit on; instead, resample the training set to
# get reliable estimates.
# 6.0 FIT A MODEL WITH RESAMPLING ----
## 6.1 Fit model ----
# Split the training set into 10 cross-validation folds; the seed makes the
# fold assignment reproducible.
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
folds # print the resampling object
# Bundle the random forest specification and the model formula into a single
# workflow so both can be passed to the resampling function together.
rf_wf <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_formula(class ~ .)
# Fit the workflow on each of the resamples in `folds`; the seed is set
# before fitting so the results are reproducible.
set.seed(456)
rf_fit_rs <-
  rf_wf %>%
  fit_resamples(folds)
## 6.2 Collect metrics ----
# Summarise the performance metrics across the resamples.
collect_metrics(rf_fit_rs)
# 7.0 CONCLUSION ----
# Think about the values we now have for accuracy and AUC. These resampled
# performance metrics are more realistic (i.e. lower) than metrics obtained
# by predicting the same training data the model was fit on -- an ill-advised
# approach that this script does not run. The code below evaluates the fitted
# model on the held-out test set.
# Predictions for the held-out test set from the model fit on the training
# data: hard class predictions, then class probabilities, then the true class
# column, bound side by side.
rf_testing_pred <-
  predict(rf_fit, cell_test) %>%
  bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
  bind_cols(cell_test %>% select(class))
# Area under the ROC curve for the test set predictions, based on the
# predicted probability of the PS class.
rf_testing_pred %>%
  roc_auc(truth = class, .pred_PS)
# Overall classification accuracy for the test set predictions.
rf_testing_pred %>%
  accuracy(truth = class, .pred_class)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment