library(tidyverse)
library(lme4) # For mixed models
library(broom)
library(broom.mixed) # Only necessary for models made with lmer()
library(knitr) # For kable()
# Build some models
# All four predict highway MPG (hwy) from engine displacement (displ);
# models 2-4 also involve drive train (drv), either as a regular
# categorical predictor or as a grouping variable for random intercepts
model1 <- lm(hwy ~ displ, data = mpg)  # Simple bivariate OLS
model2 <- lm(hwy ~ displ + drv, data = mpg)  # OLS with a categorical control
model3 <- glm(hwy ~ displ + drv, data = mpg, family = gaussian) # With gaussian, this is the same as lm(), just with glm()
model4 <- lmer(hwy ~ displ + (1 | drv), data = mpg) # Multilevel model with random intercepts for drv
# Massive walls of text :(
# summary() prints a full text report for each model. That's useful for
# reading interactively, but the output is plain text, so it's hard to
# extract individual numbers from it or combine results across models.
summary(model1)
#>
#> Call:
#> lm(formula = hwy ~ displ, data = mpg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -7.1039 -2.1646 -0.2242 2.0589 15.0105
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 35.6977 0.7204 49.55 <2e-16 ***
#> displ -3.5306 0.1945 -18.15 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.836 on 232 degrees of freedom
#> Multiple R-squared: 0.5868, Adjusted R-squared: 0.585
#> F-statistic: 329.5 on 1 and 232 DF, p-value: < 2.2e-16
# Adding drv introduces two dummy coefficients (drvf, drvr) relative to the
# baseline drv level
summary(model2)
#>
#> Call:
#> lm(formula = hwy ~ displ + drv, data = mpg)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -8.9996 -1.9066 -0.3937 1.5778 13.9207
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.8254 0.9239 33.364 < 2e-16 ***
#> displ -2.9141 0.2183 -13.352 < 2e-16 ***
#> drvf 4.7906 0.5296 9.045 < 2e-16 ***
#> drvr 5.2579 0.7336 7.167 1.03e-11 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.082 on 230 degrees of freedom
#> Multiple R-squared: 0.7356, Adjusted R-squared: 0.7322
#> F-statistic: 213.3 on 3 and 230 DF, p-value: < 2.2e-16
# The glm() version reports deviances and AIC instead of R-squared, but the
# coefficients match model2 exactly
summary(model3)
#>
#> Call:
#> glm(formula = hwy ~ displ + drv, family = gaussian, data = mpg)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -8.9996 -1.9066 -0.3937 1.5778 13.9207
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.8254 0.9239 33.364 < 2e-16 ***
#> displ -2.9141 0.2183 -13.352 < 2e-16 ***
#> drvf 4.7906 0.5296 9.045 < 2e-16 ***
#> drvr 5.2579 0.7336 7.167 1.03e-11 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 9.496362)
#>
#> Null deviance: 8261.7 on 233 degrees of freedom
#> Residual deviance: 2184.2 on 230 degrees of freedom
#> AIC: 1196.7
#>
#> Number of Fisher Scoring iterations: 2
# The lmer() summary has yet another layout: random-effect variances plus
# fixed effects (and no p-values for them, by lme4's design)
summary(model4)
#> Linear mixed model fit by REML ['lmerMod']
#> Formula: hwy ~ displ + (1 | drv)
#> Data: mpg
#>
#> REML criterion at convergence: 1199.3
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -2.9157 -0.6224 -0.1133 0.5008 4.5227
#>
#> Random effects:
#> Groups Name Variance Std.Dev.
#> drv (Intercept) 8.429 2.903
#> Residual 9.495 3.081
#> Number of obs: 234, groups: drv, 3
#>
#> Fixed effects:
#> Estimate Std. Error t value
#> (Intercept) 34.1521 1.8924 18.05
#> displ -2.9136 0.2162 -13.48
#>
#> Correlation of Fixed Effects:
#> (Intr)
#> displ -0.445
# Standardized tibbles with consistent column names!
# broom::tidy() converts each model into a data frame with one row per term
# and the same columns (term, estimate, std.error, statistic, p.value)
# regardless of which modeling function produced it
tidy(model1)
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 35.7 0.720 49.6 2.12e-125
#> 2 displ -3.53 0.195 -18.2 2.04e- 46
tidy(model2)
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.8 0.924 33.4 4.21e-90
#> 2 displ -2.91 0.218 -13.4 1.73e-30
#> 3 drvf 4.79 0.530 9.05 6.40e-17
#> 4 drvr 5.26 0.734 7.17 1.03e-11
# Same columns even though model3 was fit with glm()
tidy(model3)
#> # A tibble: 4 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.8 0.924 33.4 4.21e-90
#> 2 displ -2.91 0.218 -13.4 1.73e-30
#> 3 drvf 4.79 0.530 9.05 6.40e-17
#> 4 drvr 5.26 0.734 7.17 1.03e-11
# broom.mixed supplies the tidy() method for lmer models; it adds
# effect/group columns to distinguish fixed effects from random-effect
# standard deviations
tidy(model4)
#> # A tibble: 4 x 6
#> effect group term estimate std.error statistic
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 fixed <NA> (Intercept) 34.2 1.89 18.0
#> 2 fixed <NA> displ -2.91 0.216 -13.5
#> 3 ran_pars drv sd__(Intercept) 2.90 NA NA
#> 4 ran_pars Residual sd__Observation 3.08 NA NA
# Show an individual table: first convert the model to a data frame with
# tidy(), then render that data frame as a nice markdown table with kable()
kable(tidy(model1))
#> term | estimate | std.error | statistic | p.value |
#> ---|---|---|---|---|
#> (Intercept) | 35.697651 | 0.7203676 | 49.55477 | 0 |
#> displ | -3.530589 | 0.1945137 | -18.15085 | 0 |
# Make a coefficient plot
# Tidy the model with confidence intervals and drop the intercept row:
# the intercept is huge, would distort the graph's scale, and isn't really
# necessary anyway
model2_results <- filter(
  tidy(model2, conf.int = TRUE),
  term != "(Intercept)"
)
model2_results
#> # A tibble: 3 x 7
#> term estimate std.error statistic p.value conf.low conf.high
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.48
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.81 6.70
# Draw one point-and-interval per coefficient, plus a reference line at zero
# to show whether each estimate is significantly different from 0
ggplot(model2_results, aes(x = estimate, y = term)) +
  geom_pointrange(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(xintercept = 0, color = "red")
# SUPER FANCY ADVANCED COEFFICIENT PLOT
# Combine all the tidied regression results into one dataframe and plot the
# results from the 4 different models in one plot.
# For each model: tidy it, drop the intercept, and add a column naming the model.
model1_results <- model1 %>%
  tidy(conf.int = TRUE) %>%          # Estimates plus conf.low/conf.high
  filter(term != "(Intercept)") %>%  # The intercept would distort the plot
  mutate(model_name = "Basic OLS")   # Label which model these rows came from
model1_results
#> # A tibble: 1 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -3.53 0.195 -18.2 2.04e-46 -3.91 -3.15 Basic OLS
# Same treatment for the model with the drv control
model2_results <- model2 %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Basic OLS + one control")
model2_results
#> # A tibble: 3 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.48 Basic OLS + on…
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83 Basic OLS + on…
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.81 6.70 Basic OLS + on…
# Same treatment for the glm() version
model3_results <- model3 %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Basic OLS but with glm()")
model3_results
#> # A tibble: 3 x 8
#> term estimate std.error statistic p.value conf.low conf.high model_name
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 displ -2.91 0.218 -13.4 1.73e-30 -3.34 -2.49 Basic OLS but …
#> 2 drvf 4.79 0.530 9.05 6.40e-17 3.75 5.83 Basic OLS but …
#> 3 drvr 5.26 0.734 7.17 1.03e-11 3.82 6.70 Basic OLS but …
# For the mixed model, keep only the fixed effects (effects = "fixed") so the
# random-effect variance rows don't end up in the coefficient plot
model4_results <- model4 %>%
  tidy(conf.int = TRUE, effects = "fixed") %>%
  filter(term != "(Intercept)") %>%
  mutate(model_name = "Mixed model with random intercepts for drv")
model4_results
#> # A tibble: 1 x 8
#> effect term estimate std.error statistic conf.low conf.high model_name
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 fixed displ -2.91 0.216 -13.5 -3.34 -2.49 Mixed model with…
# Combine these four datasets into one long dataframe
models_to_plot <- list(model1_results, model2_results, model3_results, model4_results) %>%
  bind_rows() %>%
  # Keep model_name in the order the models were added instead of the
  # default alphabetical ordering when plotting
  mutate(model_name = fct_inorder(model_name))
# Plot everything, colored by model_name; fct_rev() reverses the term order
# on the y-axis
ggplot(models_to_plot, aes(x = estimate, y = fct_rev(term), color = model_name)) +
  # position_dodge() offsets each model's interval so the lines don't overlap
  geom_pointrange(
    aes(xmin = conf.low, xmax = conf.high),
    position = position_dodge(width = 0.5)
  ) +
  # Add a line at 0 to show if coefs are significantly different from 0
  geom_vline(xintercept = 0, color = "red")
# Created on 2021-03-02 by the reprex package (v0.3.0)