Skip to content

Instantly share code, notes, and snippets.

@RobWiederstein
Created May 30, 2021 18:56
Show Gist options
  • Save RobWiederstein/bb21985f3998041aeeea7dc2044df1f0 to your computer and use it in GitHub Desktop.
Save RobWiederstein/bb21985f3998041aeeea7dc2044df1f0 to your computer and use it in GitHub Desktop.
second tutorial from tidymodels
##################################################################
## tidymodels ##
## 2 Preprocess with Recipes ##
## url: https://www.tidymodels.org/start/recipes/ ##
##################################################################
# 1.0 INTRODUCTION ----
## 1.1 tidymodels ----
library(tidymodels)
## 1.2 Helper packages
library(nycflights13) # for flight data
library(skimr) # for variable summaries
# 2.0 THE NEW YORK CITY FLIGHT DATA ----
## 2.1 Set seed ----
# Fix the random number generator state before anything else runs.
set.seed(123)
## 2.2 Load ----
# Load the `flights` dataset by name.
data("flights")
## 2.3 View original ----
# Print a per-column summary of the raw data.
skim(flights)
## 2.4 Change ----
# Build the modelling table from `flights`: binary outcome, calendar date,
# weather join, column subset, complete rows only, strings as factors.
flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor (30 minutes or more is "late")
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below.
    # The time zone is passed explicitly: converting a POSIXct in UTC would
    # move evening New York departures onto the next calendar day and skew
    # the day-of-week and holiday features derived from `date`.
    date = as.Date(time_hour, tz = "America/New_York")
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings).
  # across(where()) is the current replacement for the superseded mutate_if().
  mutate(across(where(is.character), as.factor))
## 2.5 View changes ----
glimpse(flight_data)
# 3.0 DATA SPLITTING ----
## 3.1 Set seed ----
# Seed immediately before the split so the partition is reproducible.
set.seed(555)
## 3.2 Split ----
# Three quarters of the rows go to training, the rest to testing.
data_split <- flight_data %>% initial_split(prop = 0.75)
## 3.3 Training & Testing ----
# Materialise the two partitions as separate data frames.
test_data <- data_split %>% testing()
train_data <- data_split %>% training()
# 4.0 CREATE RECIPE AND ROLES ----
## 4.1 Original recipe ----
# Model arr_delay as the outcome and every other column as a predictor.
flights_rec <- recipe(arr_delay ~ ., data = train_data)
## 4.2 New recipe ----
# Re-label `flight` and `time_hour` with the "ID" role so they are kept in
# the data but are no longer treated as predictors.
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
# Print the variable / type / role table of the recipe.
summary(flights_rec)
# 5.0 FEATURE ENGINEERING ----
# Full preprocessing recipe: ID roles, date-derived features, dummy
# variables, and removal of zero-variance columns.
flights_rec <-
  recipe(arr_delay ~ ., data = train_data) %>%
  # Keep flight and time_hour in the data, but not as predictors
  update_role(flight, time_hour, new_role = "ID") %>%
  # Derive day-of-week and month factors from the date
  step_date(date, features = c("dow", "month")) %>%
  # Add one binary indicator per US holiday
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  # The raw date is no longer needed once its features are extracted
  step_rm(date) %>%
  # Dummy-encode nominal predictors only. all_nominal_predictors() is
  # role-aware, so neither the outcome nor any nominal column carrying a
  # non-predictor role (such as "ID") can be encoded by accident.
  step_dummy(all_nominal_predictors()) %>%
  # Drop predictors that contain a single unique value
  step_zv(all_predictors())
# 6.0 FIT A MODEL WITH A RECIPE ----
## 6.1 Specify model ----
# Logistic regression fitted through the "glm" engine.
lr_mod <- set_engine(logistic_reg(), "glm")
## 6.2 Specify workflow ----
# Bundle the model specification and the recipe into one workflow object.
flights_wflow <- add_model(workflow(), lr_mod)
flights_wflow <- add_recipe(flights_wflow, flights_rec)
flights_wflow
## 6.3 Fit model ----
# Fit the whole workflow on the training partition.
flights_fit <- fit(flights_wflow, data = train_data)
## 6.4 Extract results ----
# Pull the fitted parsnip model out of the trained workflow and tabulate
# its coefficients. extract_fit_parsnip() replaces the deprecated
# pull_workflow_fit().
flights_fit %>%
  extract_fit_parsnip() %>%
  tidy()
# 7.0 USE A TRAINED WORKFLOW TO PREDICT ----
## 7.1 Predict -- returns factor ----
# Default prediction type: the predicted class for each test row.
predict(flights_fit, new_data = test_data)
## 7.2 Predict -- return probability ----
# Class probabilities, placed next to the true outcome and the two ID
# columns taken from the test set.
flights_pred <- bind_cols(
  predict(flights_fit, new_data = test_data, type = "prob"),
  select(test_data, arr_delay, time_hour, flight)
)
flights_pred
## 7.3 Plot -- library('yardstick') ----
# ROC curve for the "late" class, drawn from its predicted probability.
autoplot(roc_curve(flights_pred, truth = arr_delay, .pred_late))
## 7.4 Area under ROC curve ----
# Single-number summary of the same curve.
roc_auc(flights_pred, truth = arr_delay, .pred_late)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment