Exploring student composition effects on growth in school average test scores. Example for the SDP 2019 Regression course.
###################################################################################################
## Title: Regression Module Assignment 1
## Exploring Student Composition Effects on Test Score Growth
## Author: Jared E. Knowles, Civilytics Consulting
## Date: 6/12/2019
## Last Updated: 6/17/2019
###################################################################################################
# ----------------------------------------------------------------------------
# Load the data
# You will need to change the path to point to your own version of the data file:
load("../guided tutorial/regression_1_r/data/full_sch_data.rda")
# ----------------------------------------------------------------------------
# Load any packages we will need to complete this project
# ----------------------------------------------------------------------------
library(ggplot2) # for plotting
library(dplyr) # for data manipulation
library(broom) # for working with statistical models in a tidy way
library(coefplot) # for making quick plots of regression coefficients
# ----------------------------------------------------------------------------
# Recreate original model
m1 <- lm(ss2 ~ ss1, data = sch_score)
coefplot(m1, intercept = FALSE)
# ----------------------------------------------------------------------------
# Select two additional variables
# I want to look at the role that students with disabilities and free and
# reduced-price lunch populations play in explaining student test scores
m2 <- lm(ss2 ~ ss1 + swd_per + frl_per, data = sch_score)
coefplot(m2, intercept = FALSE)
# ----------------------------------------------------------------------------
# Note that including more variables reduces the number of available observations
# because of missingness.
nrow(m1$model) - nrow(m2$model)
# 193 school-grade-subject observations are lost
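# A quick check of which variables carry the missingness (a minimal sketch; the
# column names follow the model formulas above):
colSums(is.na(sch_score[, c("ss1", "ss2", "swd_per", "frl_per")]))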
# Compare model fits
glance(m1)
glance(m2)
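# Because m2 drops rows with missing swd_per or frl_per, the fit statistics above
# are computed on different samples. A hedged sketch for an apples-to-apples
# comparison: refit the original specification on the rows m2 actually used.
m1_sub <- lm(ss2 ~ ss1, data = m2$model)
glance(m1_sub)
glance(m2)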
# Compare coefficients and estimates
tidy(m1)
tidy(m2)
# ----------------------------------------------------------------------------
# Diagnostic 1
# Residual versus fitted
# Let's compare residuals vs. fitted for m1 and m2 side by side
# Because some observations have missing values, to add them to the
# original data we need to use the "predict()" function
sch_score$m1_fitted <- predict(m1, newdata = sch_score)
sch_score$m2_fitted <- predict(m2, newdata = sch_score)
sch_score$m1_resid <- sch_score$ss2 - sch_score$m1_fitted
sch_score$m2_resid <- sch_score$ss2 - sch_score$m2_fitted
ggplot(sch_score) + aes(x = m1_fitted, y = m1_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Original Model")
ggplot(sch_score) + aes(x = m2_fitted, y = m2_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Proposed Model")
# Plot residuals and their variation by class size
# Residuals by class size
ggplot(sch_score, aes(x = n1, y = m1_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Original Model Residuals Plotted Against Class Size")
ggplot(sch_score, aes(x = n1, y = m2_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Proposed Model Residuals Plotted Against Class Size")
table("overperform" = sch_score$std_resid >= 2, "under 100" = sch_score$n1 < 100)
sch_score$std_resid <- rstandard(m1)
ggplot(sch_score) + aes(x = yhat, y = std_resid) +
geom_point(alpha = I(0.1)) +
geom_smooth() + geom_smooth(method = "lm") + theme_bw() +
labs(title = "Residual Plot for Proposed Model")
# ----------------------------------------------------------------------------
# Let's look for constant or non-constant variance in residuals by decile of fitted value
sch_score$bin <- dplyr::ntile(sch_score$m2_fitted, 10)
sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m2_resid, na.rm = TRUE),
            error_sd = sd(m2_resid, na.rm = TRUE))
sch_score$bin <- dplyr::ntile(sch_score$m1_fitted, 10)
sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m1_resid, na.rm = TRUE),
            error_sd = sd(m1_resid, na.rm = TRUE))
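# For a more formal check of non-constant variance, a hedged sketch using the
# Breusch-Pagan test, assuming the lmtest package is installed:
# install.packages("lmtest")
library(lmtest)
bptest(m2) # a small p-value is evidence of heteroskedastic residuals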
# ----------------------------------------------------------------------------
# Standard residuals
# Let's evaluate which schools would stand out as exceptionally high or low
# performers using standardized residuals
sch_score$m1_std_resid <- rstandard(m1)
# Manually compute the standardized residual because of missing values
sch_score$m2_std_resid <- sch_score$m2_resid / sd(sch_score$m2_resid, na.rm = TRUE)
# Compare original model residuals with new model standardized residuals
# Look at which schools are standouts
table(sch_score$m1_std_resid >= 2, sch_score$test_year)
table(sch_score$m2_std_resid >= 2, sch_score$test_year)
table(sch_score$m1_std_resid >= 2, sch_score$grade)
table(sch_score$m2_std_resid >= 2, sch_score$grade)
table(sch_score$m1_std_resid >= 2, sch_score$subject)
table(sch_score$m2_std_resid >= 2, sch_score$subject)
# We are still more likely to detect changes in lower grades, in math, and in
# schools tested in 2007 and 2009
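# Rates are easier to read than raw counts here; a quick sketch converting the
# grade tables above to column proportions:
prop.table(table(sch_score$m1_std_resid >= 2, sch_score$grade), margin = 2)
prop.table(table(sch_score$m2_std_resid >= 2, sch_score$grade), margin = 2)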
#-----------------------------------------------------------------------------
# outlier values
# Let's look at where outliers are on our plot
sch_score$flag_x <- ifelse(abs(as.numeric(scale(sch_score$ss1))) >= 3, "Y", "N")
sch_score$flag_y <- ifelse(abs(as.numeric(scale(sch_score$ss2))) >= 3, "Y", "N")
table(x_outlier = sch_score$flag_x, y_outlier = sch_score$flag_y)
sch_score$outlier_flag <- ifelse(sch_score$flag_x == "Y" |
                                   sch_score$flag_y == "Y", "Yes", "No")
ggplot(sch_score, aes(x = ss1, y = ss2, color = outlier_flag)) +
  geom_point(alpha = I(0.4)) +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(group = 1), method = "lm", linetype = 2, se = FALSE) +
  theme_bw()
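# Flagging on x/y position alone ignores influence on the fitted model itself.
# A hedged sketch using Cook's distance with the common 4/n rule of thumb:
cd <- cooks.distance(m2)
sum(cd > 4 / length(cd)) # observations with outsized influence on m2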
# ----------------------------------------------------------------------------
# Look at largest district compared to other districts
# m2_bos - balance of state, without the largest district
# Fit a model only to districts that are not the largest district
m2_bos <- lm(formula(m2), # we can pass the sample restriction to the data argument
             data = sch_score[sch_score$distid != 174, ])
# m2_174 - only the largest district
m2_174 <- lm(formula(m2),
             data = sch_score[sch_score$distid == 174, ])
# Review coefficients
coef(m2_bos)
coef(m2_174)
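# A hedged sketch for comparing the two sets of coefficients visually, assuming
# the coefplot package's multiplot() function is available in your version:
multiplot(m2_bos, m2_174, intercept = FALSE)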
# Create a variable holding predictions for the largest district from the model
# fit to that subsample, and predictions for all other districts from the model
# fit to the balance of state.
sch_score$m2_bos_dist[sch_score$distid == 174] <-
  predict(m2_174, newdata = sch_score[sch_score$distid == 174, ])
sch_score$m2_bos_dist[sch_score$distid != 174] <-
  predict(m2_bos, newdata = sch_score[sch_score$distid != 174, ])
# For plotting, create a flag indicating whether each observation is in the
# balance of state (i.e., not the largest district).
sch_score$bos_flag <- ifelse(sch_score$distid == 174, "No", "Yes")
# Plot
ggplot(sch_score, aes(x = ss1, y = ss2)) +
  geom_point(alpha = I(0.4), color = "cadetblue3") +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(y = m2_bos_dist, color = bos_flag, group = bos_flag),
              linetype = 2, method = "lm") +
  labs(title = "Comparing District 174 Model to Balance of State",
       subtitle = "Analyzing Fitted Values from M2 on Subsamples",
       caption = "Model represented by linear average of predictions.",
       x = "Pre-test", y = "Post-test", color = "Balance of State") +
  theme_bw()
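# A hedged sketch to quantify the difference: compare root mean squared prediction
# error for District 174 under the pooled model (m2) and the district-only model
# (m2_174). Note that m2_174 is evaluated in-sample, so expect it to look better.
d174 <- sch_score[sch_score$distid == 174, ]
sqrt(mean((d174$ss2 - predict(m2, newdata = d174))^2, na.rm = TRUE))
sqrt(mean((d174$ss2 - predict(m2_174, newdata = d174))^2, na.rm = TRUE))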