third tutorial from tidymodels
#################################################################
## tidymodels ##
## 3 Evaluate Model with Resampling ##
## url: https://www.tidymodels.org/start/resampling/ ##
#################################################################
# 1.0 INTRODUCTION ----
#Resampling measures how well a model will predict new data.
#Goal: predict image segmentation quality.
## 1.1 Load tidymodels ----
library(tidymodels) # for the rsample package, along with the rest of tidymodels
## 1.2 Load helper packages -----
library(modeldata) # for the cells data
# 2.0 THE CELL IMAGE DATA ----
## 2.1 Load data ----
data(cells, package = "modeldata")
## 2.2 Outcome variable is 'class' ----
# PS = "poorly segmented", WS = "well segmented"
cells %>%
  count(class) %>%
  mutate(prop = n / sum(n))
# 3.0 DATA SPLITTING ----
#The function rsample::initial_split() takes the original data and saves
#the information on how to make the partitions. In the original analysis,
#the authors made their own training/test set and that information is
#contained in the column "case". To demonstrate how to make a split, we’ll
#remove this column before we make our own split:
set.seed(123)
cell_split <- rsample::initial_split(cells %>% select(-case),
                                     strata = class)
#Here we used the strata argument, which conducts a stratified split. This
#ensures that, despite the imbalance we noticed in our class variable, our
#training and test data sets will keep roughly the same proportions of poorly
#and well-segmented cells as in the original data. After the initial_split,
#the training() and testing() functions return the actual data sets.
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
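# A quick check (not part of the original tutorial code) that the stratified
# split keeps roughly the same class proportions in the training set:
cell_train %>%
  count(class) %>%
  mutate(prop = n / sum(n))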
# 4.0 MODELING ----
## 4.1 Create model ----
#One of the benefits of a random forest model is that it is very low maintenance;
#it requires very little preprocessing of the data and the default parameters
#tend to give reasonable results. For that reason, we won’t create a recipe for
#the cells data.
rf_mod <-
  rand_forest(trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("classification")
## 4.2 Fit the model ----
# This new rf_fit object is the fitted model, trained on the training data set
set.seed(234)
rf_fit <-
  rf_mod %>%
  fit(class ~ ., data = cell_train)
# 5.0 ESTIMATING PERFORMANCE ----
# Performance can be measured by overall classification accuracy and by the
# area under the Receiver Operating Characteristic (ROC) curve. The yardstick
# package has functions for computing both of these measures, accuracy() and
# roc_auc(). Don't compute them on the training set you just fit on; you must
# resample the training set to get reliable estimates.
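# A minimal sketch (not in the original gist) of why re-predicting the training
# set is misleading: these "resubstitution" metrics look unrealistically good
# compared with the resampling estimates computed below.
rf_training_pred <-
  predict(rf_fit, cell_train) %>%
  bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
  bind_cols(cell_train %>% select(class))
rf_training_pred %>% # training set predictions (overly optimistic)
  roc_auc(truth = class, .pred_PS)
rf_training_pred %>%
  accuracy(truth = class, .pred_class)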
# 6.0 FIT A MODEL WITH RESAMPLING ----
## 6.1 Fit model ----
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
folds
rf_wf <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_formula(class ~ .)
set.seed(456)
rf_fit_rs <-
  rf_wf %>%
  fit_resamples(folds)
## 6.2 Collect metrics ----
collect_metrics(rf_fit_rs)
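# If per-fold predictions are also wanted, fit_resamples() can keep them; a
# sketch (an addition, not in the original tutorial code) using
# control_resamples() and collect_predictions():
rf_fit_rs_pred <-
  rf_wf %>%
  fit_resamples(folds, control = control_resamples(save_pred = TRUE))
collect_predictions(rf_fit_rs_pred)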
# 7.0 CONCLUSION ----
#Think about these values we now have for accuracy and AUC. These performance
#metrics from resampling are more realistic (i.e. lower) than metrics computed
#by re-predicting the training set, and the held-out test set provides a final
#check:
rf_testing_pred <- # test set predictions for a final check
  predict(rf_fit, cell_test) %>%
  bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
  bind_cols(cell_test %>% select(class))
rf_testing_pred %>% # test set predictions
  roc_auc(truth = class, .pred_PS)
rf_testing_pred %>% # test set predictions
  accuracy(truth = class, .pred_class)