RobWiederstein/01_build_model.R

## 01_build_model.R
#################################################################
##                          tidymodels                         ##
##                       1 build a model                       ##
##        url: https://www.tidymodels.org/start/models/        ##
#################################################################
# 1.0 INTRODUCTION ----
        ## 1.1 Parsnip pkg + the rest of tidymodels ----
        library(tidymodels)
        ## 1.2 Helper packages ----
        library(readr)       # for importing data
        library(broom.mixed) # for converting bayesian models to tidy tibbles
        library(dotwhisker)  # for visualizing regression results
# 2.0 SEA URCHINS DATA ----
        ## 2.1 About ----
        # https://link.springer.com/article/10.1007/BF00349318
        ## 2.2 Read-in ----
        urchins <-
                # Data were assembled for a tutorial
                # at https://www.flutterbys.com.au/stats/tut/tut7.5a.html
                read_csv("https://tidymodels.org/start/models/urchins.csv") %>%
                # Change the names to be a little more verbose
                setNames(c("food_regime", "initial_volume", "width")) %>%
                # Factors are very helpful for modeling, so we convert one column
                mutate(food_regime = factor(food_regime, levels = c("Initial", "Low", "High")))
        ## 2.3 Plot ----
        ggplot(urchins,
               aes(x = initial_volume,
                   y = width,
                   group = food_regime,
                   col = food_regime)) +
                geom_point() +
                geom_smooth(method = lm, se = FALSE) +
                scale_color_viridis_d(option = "plasma", end = .7)
# 3.0 BUILD AND FIT A MODEL ----
#A standard two-way analysis of variance (ANOVA) model makes sense for this
#dataset because it contains a continuous and categorical predictor
        ## 3.1 Designate model ----
        lm_mod <-
                linear_reg() %>%
                set_engine("lm")
        ## 3.2 Train/fit/estimate model ----
        lm_fit <-
                lm_mod %>%
                fit(width ~ initial_volume * food_regime, data = urchins)
        ## 3.3 print tidy ----
        tidy(lm_fit)
        ## 3.4 plot results ----
        tidy(lm_fit) %>%
                dwplot(dot_args = list(size = 2, color = "black"),
                       whisker_args = list(color = "black"),
                       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
# 4.0 USE A MODEL TO PREDICT ----
        ## 4.1 Create new points ----
        new_points <- expand.grid(initial_volume = 20,
                          food_regime = c("Initial", "Low", "High"))
        ## 4.2 Fit model to new data points ----
        mean_pred <- predict(lm_fit, new_data = new_points)
        ## 4.3 Create confidence intervals for predictions ----
        conf_int_pred <- predict(lm_fit,
                                 new_data = new_points,
                                 type = "conf_int")
        ## 4.4 New data: new points + estimates + conf int ----
        plot_data <-
                new_points %>%
                bind_cols(mean_pred) %>%
                bind_cols(conf_int_pred)
        ## 4.5 And then plot . . . ----
        ggplot(plot_data, aes(x = food_regime)) +
                geom_point(aes(y = .pred)) +
                geom_errorbar(aes(ymin = .pred_lower,
                                  ymax = .pred_upper),
                              width = .2) +
                labs(y = "urchin size")
# 5.0 MODEL WITH A DIFFERENT ENGINE ----
        ## 5.1 Set the prior distribution ----
        #couldn't get 'rstanarm' installed
        prior_dist <- rstanarm::student_t(df = 1)
        set.seed(123)
        ## 5.2 Make the parsnip model ----
        bayes_mod <-
                linear_reg() %>%
                set_engine("stan",
                           prior_intercept = prior_dist,
                           prior = prior_dist)
        ## 5.3 Train the model ----
        bayes_fit <-
                bayes_mod %>%
                fit(width ~ initial_volume * food_regime, data = urchins)
        ## 5.4 Print the model ----
        print(bayes_fit, digits = 5)
# 6.0 WHY DOES IT WORK THAT WAY? ----
        #the modeling code uses the pipe to pass around the model object:
        bayes_mod %>%
                fit(width ~ initial_volume * food_regime, data = urchins)
	#################################################################
	## tidymodels ##
	## 1 build a model ##
	## url: https://www.tidymodels.org/start/models/ ##
	#################################################################
	# 1.0 INTRODUCTION ----
	## 1.1 Parsnip pkg + the rest of tidymodels ----
	library(tidymodels)
	## 1.2 Helper packages ----
	library(readr) # for importing data
	library(broom.mixed) # for converting bayesian models to tidy tibbles
	library(dotwhisker) # for visualizing regression results
	# 2.0 SEA URCHINS DATA ----
	## 2.1 About ----
	# https://link.springer.com/article/10.1007/BF00349318
	## 2.2 Read-in ----
	urchins <-
	# Data were assembled for a tutorial
	# at https://www.flutterbys.com.au/stats/tut/tut7.5a.html
	read_csv("https://tidymodels.org/start/models/urchins.csv") %>%
	# Change the names to be a little more verbose
	setNames(c("food_regime", "initial_volume", "width")) %>%
	# Factors are very helpful for modeling, so we convert one column
	mutate(food_regime = factor(food_regime, levels = c("Initial", "Low", "High")))
	## 2.3 Plot ----
	ggplot(urchins,
	aes(x = initial_volume,
	y = width,
	group = food_regime,
	col = food_regime)) +
	geom_point() +
	geom_smooth(method = lm, se = FALSE) +
	scale_color_viridis_d(option = "plasma", end = .7)
	# 3.0 BUILD AND FIT A MODEL ----
	#A standard two-way analysis of variance (ANOVA) model makes sense for this
	#dataset because it contains a continuous and categorical predictor
	## 3.1 Designate model ----
	lm_mod <-
	linear_reg() %>%
	set_engine("lm")
	## 3.2 Train/fit/estimate model ----
	lm_fit <-
	lm_mod %>%
	fit(width ~ initial_volume * food_regime, data = urchins)
	## 3.3 print tidy ----
	tidy(lm_fit)
	## 3.4 plot results ----
	tidy(lm_fit) %>%
	dwplot(dot_args = list(size = 2, color = "black"),
	whisker_args = list(color = "black"),
	vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
	# 4.0 USE A MODEL TO PREDICT ----
	## 4.1 Create new points ----
	new_points <- expand.grid(initial_volume = 20,
	food_regime = c("Initial", "Low", "High"))
	## 4.2 Fit model to new data points ----
	mean_pred <- predict(lm_fit, new_data = new_points)
	## 4.3 Create confidence intervals for predictions ----
	conf_int_pred <- predict(lm_fit,
	new_data = new_points,
	type = "conf_int")
	## 4.4 New data: new points + estimates + conf int ----
	plot_data <-
	new_points %>%
	bind_cols(mean_pred) %>%
	bind_cols(conf_int_pred)
	## 4.5 And then plot . . . ----
	ggplot(plot_data, aes(x = food_regime)) +
	geom_point(aes(y = .pred)) +
	geom_errorbar(aes(ymin = .pred_lower,
	ymax = .pred_upper),
	width = .2) +
	labs(y = "urchin size")
	# 5.0 MODEL WITH A DIFFERENT ENGINE ----
	## 5.1 Set the prior distribution ----
	#couldn't get 'rstanarm' installed
	prior_dist <- rstanarm::student_t(df = 1)
	set.seed(123)
	## 5.2 Make the parsnip model ----
	bayes_mod <-
	linear_reg() %>%
	set_engine("stan",
	prior_intercept = prior_dist,
	prior = prior_dist)
	## 5.3 Train the model ----
	bayes_fit <-
	bayes_mod %>%
	fit(width ~ initial_volume * food_regime, data = urchins)
	## 5.4 Print the model ----
	print(bayes_fit, digits = 5)
	# 6.0 WHY DOES IT WORK THAT WAY? ----
	#the modeling code uses the pipe to pass around the model object:
	bayes_mod %>%
	fit(width ~ initial_volume * food_regime, data = urchins)