library(tidyverse)
library(lme4) # For mixed models
library(broom)
library(broom.mixed) # Only necessary for models made with lmer()
library(knitr) # For kable()
# Build some models
# All four predict highway MPG (hwy) from engine displacement (displ);
# models 2-4 also involve drive train (drv), either as a regular
# categorical predictor or as a grouping variable for random intercepts
model1 <- lm(hwy ~ displ, data = mpg)  # Simple bivariate OLS
model2 <- lm(hwy ~ displ + drv, data = mpg)  # OLS with a categorical control
model3 <- glm(hwy ~ displ + drv, data = mpg, family = gaussian) # With gaussian, this is the same as lm(), just with glm()
model4 <- lmer(hwy ~ displ + (1 | drv), data = mpg) # Multilevel model with random intercepts for drv
# Massive walls of text :(
# summary() prints a full text report for each model. That's useful for
# reading interactively, but the output is plain text, so it's hard to
# extract individual numbers from it or combine results across models.
summary(model1)
#>
#> Call:
#> lm(formula = hwy ~ displ, data = mpg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -7.1039 -2.1646 -0.2242 2.0589 15.0105
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 35.6977 0.7204 49.55 <2e-16 ***
#> displ -3.5306 0.1945 -18.15 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.836 on 232 degrees of freedom
#> Multiple R-squared: 0.5868, Adjusted R-squared: 0.585
#> F-statistic: 329.5 on 1 and 232 DF, p-value: < 2.2e-16
# Adding drv introduces two dummy coefficients (drvf, drvr) relative to the
# baseline drv level
summary(model2)
#>
#> Call:
#> lm(formula = hwy ~ displ + drv, data = mpg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -8.9996 -1.9066 -0.3937 1.5778 13.9207
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.8254 0.9239 33.364 < 2e-16 ***
#> displ -2.9141 0.2183 -13.352 < 2e-16 ***
#> drvf 4.7906 0.5296 9.045 < 2e-16 ***
#> drvr 5.2579 0.7336 7.167 1.03e-11 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.082 on 230 degrees of freedom
#> Multiple R-squared: 0.7356, Adjusted R-squared: 0.7322
#> F-statistic: 213.3 on 3 and 230 DF, p-value: < 2.2e-16
# The glm() version reports deviances and AIC instead of R-squared, but the
# coefficients match model2 exactly
summary(model3)
#>
#> Call:
#> glm(formula = hwy ~ displ + drv, family = gaussian, data = mpg)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -8.9996 -1.9066 -0.3937 1.5778 13.9207
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.8254 0.9239 33.364 < 2e-16 ***
#> displ -2.9141 0.2183 -13.352 < 2e-16 ***
#> drvf 4.7906 0.5296 9.045 < 2e-16 ***
#> drvr 5.2579 0.7336 7.167 1.03e-11 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 9.496362)
#>
#> Null deviance: 8261.7 on 233 degrees of freedom
#> Residual deviance: 2184.2 on 230 degrees of freedom
#> AIC: 1196.7
#>
#> Number of Fisher Scoring iterations: 2
# The lmer() summary has yet another layout: random-effect variances plus
# fixed effects (and no p-values for them, by lme4's design)
summary(model4)
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: hwy ~ displ + (1 | drv)
#> Data: mpg
#>
#> REML criterion at convergence: 1199.3
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -2.9157 -0.6224 -0.1133 0.5008 4.5227
#>
#> Random effects:
#> Groups Name Variance Std.Dev.
#> drv (Intercept) 8.429 2.903
#> Residual 9.495 3.081
#> Number of obs: 234, groups: drv, 3
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 34.1521 1.8924 18.05
#> displ -2.9136 0.2162 -13.48
#>
#> Correlation of Fixed Effects:
#> (Intr)
#> displ -0.445
# Standardized tibbles with consistent column names!
# broom::tidy() converts each model into a data frame with one row per term
# and the same columns (term, estimate, std.error, statistic, p.value)
# regardless of which modeling function produced it
tidy(model1)
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 35.7 0.720 49.6 2.12e-125
#> 2 displ -3.53 0.195 -18.2 2.04e- 46
tidy(model2)
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.8 0.924 33.4 4.21e-90
#> 2 displ -2.91 0.218 -13.4 1.73e-30
#> 3 drvf 4.79 0.530 9.05 6.40e-17
#> 4 drvr 5.26 0.734 7.17 1.03e-11
# Same columns even though model3 was fit with glm()
tidy(model3)
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.8 0.924 33.4 4.21e-90
#> 2 displ -2.91 0.218 -13.4 1.73e-30
#> 3 drvf 4.79 0.530 9.05 6.40e-17
#> 4 drvr 5.26 0.734 7.17 1.03e-11
# broom.mixed supplies the tidy() method for lmer models; it adds
# effect/group columns to distinguish fixed effects from random-effect
# standard deviations
tidy(model4)
#> # A tibble: 4 x 6
#> effect group term estimate std.error statistic
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 fixed <NA> (Intercept) 34.2 1.89 18.0
#> 2 fixed <NA> displ -2.91 0.216 -13.5
#> 3 ran_pars drv sd__(Intercept) 2.90 NA NA
#> 4 ran_pars Residual sd__Observation 3.08 NA NA
# Show an individual table: first convert the model to a data frame with
# tidy(), then render that data frame as a nice markdown table with kable()
kable(tidy(model1))
#> term | estimate | std.error | statistic | p.value |
#> ---|---|---|---|---|
#> (Intercept) | 35.697651 | 0.7203676 | 49.55477 | 0 |
#> displ | -3.530589 | 0.1945137 | -18.15085 | 0 |
# Make a coefficient plot
# Tidy the model with confidence intervals and drop the intercept row:
# the intercept is huge, would distort the graph's scale, and isn't really
# necessary anyway
model2_results <- filter(
  tidy(model2, conf.int = TRUE),
  term != "(Intercept)"
)
model2_results
#> # A tibble: 3 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.48
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.81 6.70
# Draw one point-and-interval per coefficient, plus a reference line at zero
# to show whether each estimate is significantly different from 0
ggplot(model2_results, aes(x = estimate, y = term)) +
  geom_pointrange(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(xintercept = 0, color = "red")
# SUPER FANCY ADVANCED COEFFICIENT PLOT
# Combine all the tidied regression results into one dataframe and plot the
# results from the 4 different models in one plot.
# For each model: tidy it, drop the intercept, and add a column naming the model.
model1_results <- model1 %>%
  tidy(conf.int = TRUE) %>%          # Estimates plus conf.low/conf.high
  filter(term != "(Intercept)") %>%  # The intercept would distort the plot
  mutate(model_name = "Basic OLS")   # Label which model these rows came from
model1_results
#> # A tibble: 1 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -3.53 0.195 -18.2 2.04e-46 -3.91 -3.15 Basic OLS
# Same treatment for the model with the drv control
model2_results <- model2 %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Basic OLS + one control")
model2_results
#> # A tibble: 3 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.48 Basic OLS + on…
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83 Basic OLS + on…
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.81 6.70 Basic OLS + on…
# Same treatment for the glm() version
model3_results <- model3 %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Basic OLS but with glm()")
model3_results
#> # A tibble: 3 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.49 Basic OLS but …
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83 Basic OLS but …
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.82 6.70 Basic OLS but …
# For the mixed model, keep only the fixed effects (effects = "fixed") so the
# random-effect variance rows don't end up in the coefficient plot
model4_results <- model4 %>%
  tidy(conf.int = TRUE, effects = "fixed") %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Mixed model with random intercepts for drv")
model4_results
#> # A tibble: 1 x 8
#> effect term estimate std.error statistic conf.low conf.high model_name
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 fixed displ -2.91 0.216 -13.5 -3.34 -2.49 Mixed model with…
# Combine these four datasets into one long dataframe
models_to_plot <- list(model1_results, model2_results, model3_results, model4_results) %>%
  bind_rows() %>%
  # Keep model_name in the order the models were added instead of the
  # default alphabetical ordering when plotting
  mutate(model_name = fct_inorder(model_name))
# Plot everything, colored by model_name; fct_rev() reverses the term order
# on the y-axis
ggplot(models_to_plot, aes(x = estimate, y = fct_rev(term), color = model_name)) +
  # position_dodge() offsets each model's interval so the lines don't overlap
  geom_pointrange(
    aes(xmin = conf.low, xmax = conf.high),
    position = position_dodge(width = 0.5)
  ) +
  # Add a line at 0 to show if coefs are significantly different from 0
  geom_vline(xintercept = 0, color = "red")
# Created on 2021-03-02 by the reprex package (v0.3.0)