Skip to content

Instantly share code, notes, and snippets.

@RobWiederstein
Last active May 30, 2021 18:57
Show Gist options
  • Save RobWiederstein/4b47ffac86d170413c750a43c16ea896 to your computer and use it in GitHub Desktop.
Save RobWiederstein/4b47ffac86d170413c750a43c16ea896 to your computer and use it in GitHub Desktop.
first tutorial from tidymodels
#################################################################
## tidymodels ##
## 1 build a model ##
## url: https://www.tidymodels.org/start/models/ ##
#################################################################
# 1.0 INTRODUCTION ----
## 1.1 Parsnip pkg + the rest of tidymodels ----
library(tidymodels)
## 1.2 Helper packages ----
library(readr) # for importing data
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker) # for visualizing regression results
# 2.0 SEA URCHINS DATA ----
## 2.1 About ----
# https://link.springer.com/article/10.1007/BF00349318
## 2.2 Read-in ----
urchins <-
# Data were assembled for a tutorial
# at https://www.flutterbys.com.au/stats/tut/tut7.5a.html
read_csv("https://tidymodels.org/start/models/urchins.csv") %>%
# Change the names to be a little more verbose
setNames(c("food_regime", "initial_volume", "width")) %>%
# Factors are very helpful for modeling, so we convert one column
mutate(food_regime = factor(food_regime, levels = c("Initial", "Low", "High")))
## 2.3 Plot ----
ggplot(urchins,
aes(x = initial_volume,
y = width,
group = food_regime,
col = food_regime)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
scale_color_viridis_d(option = "plasma", end = .7)
# 3.0 BUILD AND FIT A MODEL ----
#A standard two-way analysis of variance (ANOVA) model makes sense for this
#dataset because it contains a continuous and categorical predictor
## 3.1 Designate model ----
lm_mod <-
linear_reg() %>%
set_engine("lm")
## 3.2 Train/fit/estimate model ----
lm_fit <-
lm_mod %>%
fit(width ~ initial_volume * food_regime, data = urchins)
## 3.3 print tidy ----
tidy(lm_fit)
## 3.4 plot results ----
tidy(lm_fit) %>%
dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
# 4.0 USE A MODEL TO PREDICT ----
## 4.1 Create new points ----
new_points <- expand.grid(initial_volume = 20,
food_regime = c("Initial", "Low", "High"))
## 4.2 Fit model to new data points ----
mean_pred <- predict(lm_fit, new_data = new_points)
## 4.3 Create confidence intervals for predictions ----
conf_int_pred <- predict(lm_fit,
new_data = new_points,
type = "conf_int")
## 4.4 New data: new points + estimates + conf int ----
plot_data <-
new_points %>%
bind_cols(mean_pred) %>%
bind_cols(conf_int_pred)
## 4.5 And then plot . . . ----
ggplot(plot_data, aes(x = food_regime)) +
geom_point(aes(y = .pred)) +
geom_errorbar(aes(ymin = .pred_lower,
ymax = .pred_upper),
width = .2) +
labs(y = "urchin size")
# 5.0 MODEL WITH A DIFFERENT ENGINE ----
## 5.1 Set the prior distribution ----
#couldn't get 'rstanarm' installed
prior_dist <- rstanarm::student_t(df = 1)
set.seed(123)
## 5.2 Make the parsnip model ----
bayes_mod <-
linear_reg() %>%
set_engine("stan",
prior_intercept = prior_dist,
prior = prior_dist)
## 5.3 Train the model ----
bayes_fit <-
bayes_mod %>%
fit(width ~ initial_volume * food_regime, data = urchins)
## 5.4 Print the model ----
print(bayes_fit, digits = 5)
# 6.0 WHY DOES IT WORK THAT WAY? ----
#the modeling code uses the pipe to pass around the model object:
bayes_mod %>%
fit(width ~ initial_volume * food_regime, data = urchins)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment