# Gelman and Hill 2007: Chapter 3 Regression Examples
# Using ggplot and estimatr
rm(list = ls())
# Uncomment to install
# install.packages("ggplot2")
# install.packges("haven")
# install.pacakges("estimatr")
library(ggplot2)
library(haven)
library(estimatr)
# Data and example are from Gelman and Hill (2007): a subsample from the National Longitudinal Survey of Youth.
# Downloaded from http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/
kid <- read_dta("http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta")
# The overriding goal is to describe the variable "kid_score" and how it might vary with mom_hs and mom_iq
# let's look at the variable itself
ggplot(kid, aes(kid_score)) + geom_histogram(bins = 30)
# what's a good single number summary?
# MSE demonstration -------------------------------------------------------
proposals <- seq(50, 150, by = 0.1)
mses <- rep(NA, length(proposals))
for (i in seq_along(proposals)) {
  mses[i] <- mean((proposals[i] - kid$kid_score)^2)
}
proposals[which.min(mses)]
# really close to
mean(kid$kid_score)
# which is to say, the mean is a "good" single-number summary: among all
# constant predictions, it minimizes the mean squared error
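# a quick check (my addition, not in the original): optimize() finds the
# MSE-minimizing constant directly, without the grid search
optimize(function(m) mean((m - kid$kid_score)^2), interval = c(50, 150))$minimum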
# Setting up a predictions data.frame -------------------------------------
prediction_df <-
  data.frame(mom_hs = c(0, 0, 1, 1),
             mom_iq = c(71, 140, 71, 140))
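# (my note) these four profiles cross mom_hs in {0, 1} with mom_iq in {71, 140},
# roughly the observed range of mom_iq in these data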
# plot 1: kid_score ~ 1 ---------------------------------------------------
g_1 <- ggplot(kid, aes(y = kid_score, x = 1)) + geom_point()
g_1
# the regression
fit_1 <- lm_robust(kid_score ~ 1, data = kid)
fit_1
# comment: this is the same as the mean
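# sanity check (my addition, not in the original): the intercept equals the sample mean
all.equal(unname(coef(fit_1)), mean(kid$kid_score))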
prediction_df$pred_1 <- predict(fit_1, newdata = prediction_df)
# add the summary on top
g_1 + geom_point(data = prediction_df, aes(y = pred_1), size = 6, color = "red")
# plot 2: kid_score ~ mom_hs ----------------------------------------------
g_2 <- ggplot(kid, aes(y = kid_score, x = mom_hs)) + geom_point()
g_2
# the regression
fit_2 <- lm_robust(kid_score ~ mom_hs, data = kid)
fit_2
# comment: this is the MSE-minimizing two-number summary, using mom_hs!
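# another check (my addition): with a binary regressor, the two fitted values
# are just the group means of kid_score by mom_hs
tapply(kid$kid_score, kid$mom_hs, mean)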
prediction_df$pred_2 <- predict(fit_2, newdata = prediction_df)
# add the summary on top
g_2 + geom_line(data = prediction_df, aes(y = pred_2), color = "red", size = 6)
# could also do this:
g_2 + stat_smooth(method = "lm_robust")
# Plot 3: kid_score ~ mom_iq ----------------------------------------------
g_3 <- ggplot(kid, aes(y = kid_score, x = mom_iq)) + geom_point()
g_3
# the regression
fit_3 <- lm_robust(kid_score ~ mom_iq, data = kid)
fit_3
# comment: the MSE-minimizing two-number summary, using mom_iq
prediction_df$pred_3 <- predict(fit_3, newdata = prediction_df)
# add the summary on top
g_3 + geom_line(data = prediction_df, aes(y = pred_3), color = "red", size = 2)
# could also do
g_3 + stat_smooth(method = "lm_robust")
# Plot 4: kid_score ~ mom_hs + mom_iq -------------------------------------
g_4 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_4
# the regression
fit_4 <- lm_robust(kid_score ~ mom_hs + mom_iq, data = kid)
fit_4
prediction_df$pred_4 <- predict(fit_4, newdata = prediction_df)
g_4 + geom_line(data = prediction_df, aes(y = pred_4), size = 2)
# hard to use stat_smooth for this one...
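# why (my note): stat_smooth fits a separate regression within each color
# group, so the slopes would differ by mom_hs -- that's the interaction model
# we fit next in Plot 5, not the additive model fit_4, whose lines share a slope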
# Plot 5: kid_score ~ mom_hs + mom_iq + mom_hs*mom_iq ---------------------
g_5 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_5
# the regression
fit_5 <- lm_robust(kid_score ~ mom_hs + mom_iq + mom_hs * mom_iq, data = kid)
fit_5
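# reading off the implied slopes (my addition, not in the original): the
# interaction lets the mom_iq slope differ by mom_hs
b <- coef(fit_5)
b["mom_iq"]                       # slope among mom_hs == 0
b["mom_iq"] + b["mom_hs:mom_iq"]  # slope among mom_hs == 1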
prediction_df$pred_5 <- predict(fit_5, newdata = prediction_df)
g_5 + geom_line(data = prediction_df, aes(y = pred_5), size = 2)
# stat_smooth works here, though: fitting a separate regression per group is exactly the interaction model
g_5 + stat_smooth(method = "lm_robust", fullrange = TRUE)
# Summary -----------------------------------------------------------------
# We ran five models. Each makes *different* predictions for these four units:
prediction_df
# each column of predictions corresponds to a different descriptive summary of the data
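# a closing check (my addition, not in the original): compare the models'
# in-sample mean squared errors; more flexible summaries fit better in-sample,
# though fit_2 and fit_3 are not nested, so their ordering isn't guaranteed
fits <- list(fit_1, fit_2, fit_3, fit_4, fit_5)
sapply(fits, function(f) mean((kid$kid_score - predict(f, newdata = kid))^2))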