# Gelman and Hill 2007: Chapter 3 Regression Examples
# Using ggplot and estimatr
rm(list = ls())
# Uncomment to install
# install.packages("ggplot2")
# install.packges("haven")
# install.pacakges("estimatr")
library(ggplot2)
library(haven)
library(estimatr)
# Data and example are from Gelman and Hill (2007): a subsample from the National Longitudinal Survey of Youth.
# Downloaded from http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/
kid <- read_dta("http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta")
# The overriding goal is to describe the variable "kid_score" and how it might vary with mom_hs and mom_iq
# let's look at the variable itself
ggplot(kid, aes(kid_score)) + geom_histogram(bins = 30)
# what's a good single number summary?
# MSE demonstration -------------------------------------------------------
proposals <- seq(50, 150, by = 0.1)
mses <- rep(NA, length(proposals))
for (i in seq_along(proposals)) {
  mses[i] <- mean((proposals[i] - kid$kid_score)^2)
}
proposals[which.min(mses)]
# really close to
mean(kid$kid_score)
# which is to say, the mean is a "good" single-number summary: among all
# constant predictions, it minimizes the mean squared error
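# a quick check (my addition, not in the original): optimize() finds the
# MSE-minimizing constant directly, without the grid search
optimize(function(m) mean((m - kid$kid_score)^2), interval = c(50, 150))$minimum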
# Setting up a predictions data.frame -------------------------------------
prediction_df <-
  data.frame(mom_hs = c(0, 0, 1, 1),
             mom_iq = c(71, 140, 71, 140))
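# (my note) these four profiles cross mom_hs in {0, 1} with mom_iq in {71, 140},
# roughly the observed range of mom_iq in these data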
# plot 1: kid_score ~ 1 ---------------------------------------------------
g_1 <- ggplot(kid, aes(y = kid_score, x = 1)) + geom_point()
g_1
# the regression
fit_1 <- lm_robust(kid_score ~ 1, data = kid)
fit_1
# comment: this is the same as the mean
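# sanity check (my addition, not in the original): the intercept equals the sample mean
all.equal(unname(coef(fit_1)), mean(kid$kid_score))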
prediction_df$pred_1 <- predict(fit_1, newdata = prediction_df)
# add the summary on top
g_1 + geom_point(data = prediction_df, aes(y = pred_1), size = 6, color = "red")
# plot 2: kid_score ~ mom_hs ----------------------------------------------
g_2 <- ggplot(kid, aes(y = kid_score, x = mom_hs)) + geom_point()
g_2
# the regression
fit_2 <- lm_robust(kid_score ~ mom_hs, data = kid)
fit_2
# comment: this is the MSE-minimizing two-number summary, using mom_hs!
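# another check (my addition): with a binary regressor, the two fitted values
# are just the group means of kid_score by mom_hs
tapply(kid$kid_score, kid$mom_hs, mean)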
prediction_df$pred_2 <- predict(fit_2, newdata = prediction_df)
# add the summary on top
g_2 + geom_line(data = prediction_df, aes(y = pred_2), color = "red", size = 6)
# could also do this:
g_2 + stat_smooth(method = "lm_robust")
# Plot 3: kid_score ~ mom_iq ----------------------------------------------
g_3 <- ggplot(kid, aes(y = kid_score, x = mom_iq)) + geom_point()
g_3
# the regression
fit_3 <- lm_robust(kid_score ~ mom_iq, data = kid)
fit_3
# comment: the MSE-minimizing two-number summary, using mom_iq
prediction_df$pred_3 <- predict(fit_3, newdata = prediction_df)
# add the summary on top
g_3 + geom_line(data = prediction_df, aes(y = pred_3), color = "red", size = 2)
# could also do
g_3 + stat_smooth(method = "lm_robust")
# Plot 4: kid_score ~ mom_hs + mom_iq -------------------------------------
g_4 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_4
# the regression
fit_4 <- lm_robust(kid_score ~ mom_hs + mom_iq, data = kid)
fit_4
prediction_df$pred_4 <- predict(fit_4, newdata = prediction_df)
g_4 + geom_line(data = prediction_df, aes(y = pred_4), size = 2)
# hard to use stat_smooth for this one...
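# why (my note): stat_smooth fits a separate regression within each color
# group, so the slopes would differ by mom_hs -- that's the interaction model
# we fit next in Plot 5, not the additive model fit_4, whose lines share a slope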
# Plot 5: kid_score ~ mom_hs + mom_iq + mom_hs*mom_iq ---------------------
g_5 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_5
# the regression
fit_5 <- lm_robust(kid_score ~ mom_hs + mom_iq + mom_hs * mom_iq, data = kid)
fit_5
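# reading off the implied slopes (my addition, not in the original): the
# interaction lets the mom_iq slope differ by mom_hs
b <- coef(fit_5)
b["mom_iq"]                       # slope among mom_hs == 0
b["mom_iq"] + b["mom_hs:mom_iq"]  # slope among mom_hs == 1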
prediction_df$pred_5 <- predict(fit_5, newdata = prediction_df)
g_5 + geom_line(data = prediction_df, aes(y = pred_5), size = 2)
# stat_smooth works here, though: fitting a separate regression per group is exactly the interaction model
g_5 + stat_smooth(method = "lm_robust", fullrange = TRUE)
# Summary -----------------------------------------------------------------
# We ran five models. Each makes *different* predictions for these four units:
prediction_df
# each column of predictions corresponds to a different descriptive summary of the data
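# a closing check (my addition, not in the original): compare the models'
# in-sample mean squared errors; more flexible summaries fit better in-sample,
# though fit_2 and fit_3 are not nested, so their ordering isn't guaranteed
fits <- list(fit_1, fit_2, fit_3, fit_4, fit_5)
sapply(fits, function(f) mean((kid$kid_score - predict(f, newdata = kid))^2))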