Created
May 30, 2021 18:56
-
-
Save RobWiederstein/bb21985f3998041aeeea7dc2044df1f0 to your computer and use it in GitHub Desktop.
second tutorial from tidymodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################## | |
## tidymodels ## | |
## 2 Preprocess with Recipes ## | |
## url: https://www.tidymodels.org/start/recipes/ ## | |
################################################################## | |
# 1.0 INTRODUCTION ---- | |
## 1.1 tidymodels ---- | |
library(tidymodels) | |
## 1.2 Helper packages | |
library(nycflights13) # for flight data | |
library(skimr) # for variable summaries | |
# 2.0 THE NEW YORK CITY FLIGHT DATA ----
## 2.1 Set seed ----
# Pin the random number generator so repeated runs give the same results.
set.seed(123)
## 2.2 Load ----
# Load the `flights` data set into the workspace.
data("flights")
## 2.3 View original ----
# Column-by-column summary of the data before any changes are made.
skim(flights)
## 2.4 Change ----
# Build the modelling table: derive the outcome and a calendar date, join the
# hourly weather records, keep only the columns used later, drop incomplete
# rows, and turn character columns into factors.
flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor: 30+ minutes counts as "late"
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below.
    # lubridate::as_date() respects the time zone stored on `time_hour`;
    # base as.Date() on a POSIXct defaults to tz = "UTC", which would shift
    # evening departures onto the following calendar day and distort the
    # day-of-week / month / holiday features derived from `date`.
    date = lubridate::as_date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings).
  # across() + where() replaces the superseded mutate_if().
  mutate(across(where(is.character), as.factor))
## 2.5 View changes ----
glimpse(flight_data)
# 3.0 DATA SPLITTING ----
## 3.1 Set seed ----
# Re-seed so the random split below is reproducible.
set.seed(555)
## 3.2 Split ----
# Three quarters of the rows are assigned to training, the rest to testing.
data_split <- flight_data %>% initial_split(prop = 0.75)
## 3.3 Training & Testing ----
# Materialise the two partitions as separate data frames.
train_data <- data_split %>% training()
test_data <- data_split %>% testing()
# 4.0 CREATE RECIPE AND ROLES ----
## 4.1 Original recipe ----
# Outcome is arr_delay; every other column of the training set is a predictor.
flights_rec <- recipe(arr_delay ~ ., data = train_data)
## 4.2 New recipe ----
# Same recipe, except `flight` and `time_hour` are given the role "ID",
# so they stay in the data but are no longer treated as predictors.
flights_rec <- update_role(
  recipe(arr_delay ~ ., data = train_data),
  flight, time_hour,
  new_role = "ID"
)
# Show each variable with its type and assigned role.
summary(flights_rec)
# 5.0 FEATURE ENGINEERING ----
# Rebuild the recipe from scratch and append the preprocessing steps.
flights_rec <-
  recipe(arr_delay ~ ., data = train_data) %>%
  # Keep the identifier columns out of the predictor set
  update_role(flight, time_hour, new_role = "ID") %>%
  # Derive day-of-week and month features from the date
  step_date(date, features = c("dow", "month")) %>%
  # Add indicator columns for the US holidays known to timeDate
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  # The raw date is no longer needed once the features exist
  step_rm(date) %>%
  # Dummy-encode every nominal column except the outcome
  step_dummy(all_nominal(), -all_outcomes()) %>%
  # Discard predictors that hold a single value in the training data
  step_zv(all_predictors())
# 6.0 FIT A MODEL WITH A RECIPE ----
## 6.1 Specify model ----
# Logistic regression specification using the "glm" engine.
lr_mod <- set_engine(logistic_reg(), "glm")
## 6.2 Specify workflow ----
# Bundle the model specification and the recipe into one workflow object.
flights_wflow <- add_recipe(
  add_model(workflow(), lr_mod),
  flights_rec
)
flights_wflow
## 6.3 Fit model ----
# Fit the whole workflow on the training partition.
flights_fit <- fit(flights_wflow, data = train_data)
## 6.4 Extract results ----
# Pull the fitted parsnip model out of the workflow and tidy its coefficients.
# extract_fit_parsnip() is the replacement for pull_workflow_fit(), which
# workflows has deprecated.
flights_fit %>%
  extract_fit_parsnip() %>%
  tidy()
# 7.0 USE A TRAINED WORKFLOW TO PREDICT ----
## 7.1 Predict -- returns factor ----
# Default prediction type: one predicted class per test row.
predict(flights_fit, test_data)
## 7.2 Predict -- return probability ----
# Class probabilities, placed side by side with the true outcome and the
# identifier columns from the test set.
flights_pred <- bind_cols(
  predict(flights_fit, test_data, type = "prob"),
  select(test_data, arr_delay, time_hour, flight)
)
flights_pred
## 7.3 Plot -- library('yardstick') ----
# ROC curve computed from the predicted probability of the "late" class.
autoplot(roc_curve(flights_pred, truth = arr_delay, .pred_late))
## 7.4 Area under ROC curve ----
roc_auc(flights_pred, truth = arr_delay, .pred_late)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment