Exploring student composition effects on growth in school average test scores. Example for the SDP 2019 Regression course.
###################################################################################################
## Title: Regression Module Assignment 1
## Exploring Student Composition Effects on Test Score Growth
## Author: Jared E. Knowles, Civilytics Consulting
## Date: 6/12/2019
## Last Updated: 6/17/2019
###################################################################################################
# ----------------------------------------------------------------------------
# Load the data
# You will need to change the path to point to your own version of the data file:
load("../guided tutorial/regression_1_r/data/full_sch_data.rda")
# ----------------------------------------------------------------------------
# Load any packages we will need to complete this project
# ----------------------------------------------------------------------------
library(ggplot2) # for plotting
library(dplyr) # for data manipulation
library(broom) # for working with statistical models in a tidy way
library(coefplot) # for making quick plots of regression coefficients
# ----------------------------------------------------------------------------
# Recreate original model
m1 <- lm(ss2 ~ ss1, data = sch_score)
coefplot(m1, intercept = FALSE)
# ----------------------------------------------------------------------------
# Select two additional variables
# I want to look at the role that students with disabilities and free and
# reduced-price lunch populations play in explaining student test scores
m2 <- lm(ss2 ~ ss1 + swd_per + frl_per, data = sch_score)
coefplot(m2, intercept = FALSE)
# ----------------------------------------------------------------------------
# Note that including more variables reduces the number of available observations
# because of missingness.
nrow(m1$model) - nrow(m2$model)
# 193 school-grade-subject observations are lost
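# A quick check of which variables carry the missingness (a minimal sketch; the
# column names follow the model formulas above):
colSums(is.na(sch_score[, c("ss1", "ss2", "swd_per", "frl_per")]))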
# Compare model fits
glance(m1)
glance(m2)
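# Because m2 drops rows with missing swd_per or frl_per, the fit statistics above
# are computed on different samples. A hedged sketch for an apples-to-apples
# comparison: refit the original specification on the rows m2 actually used.
m1_sub <- lm(ss2 ~ ss1, data = m2$model)
glance(m1_sub)
glance(m2)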
# Compare coefficients and estimates
tidy(m1)
tidy(m2)
# ----------------------------------------------------------------------------
# Diagnostic 1
# Residual versus fitted
# Let's compare residuals vs. fitted for m1 and m2 side by side
# Because some observations have missing values, to add them to the
# original data we need to use the "predict()" function
sch_score$m1_fitted <- predict(m1, newdata = sch_score)
sch_score$m2_fitted <- predict(m2, newdata = sch_score)
sch_score$m1_resid <- sch_score$ss2 - sch_score$m1_fitted
sch_score$m2_resid <- sch_score$ss2 - sch_score$m2_fitted
ggplot(sch_score) + aes(x = m1_fitted, y = m1_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Original Model")
ggplot(sch_score) + aes(x = m2_fitted, y = m2_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Proposed Model")
# Plot residuals and their variation by class size
# Residuals by class size
ggplot(sch_score, aes(x = n1, y = m1_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Original Model Residuals Plotted Against Class Size")
ggplot(sch_score, aes(x = n1, y = m2_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Proposed Model Residuals Plotted Against Class Size")
table("overperform" = sch_score$std_resid >= 2, "under 100" = sch_score$n1 < 100)
sch_score$std_resid <- rstandard(m1)
ggplot(sch_score) + aes(x = yhat, y = std_resid) +
geom_point(alpha = I(0.1)) +
geom_smooth() + geom_smooth(method = "lm") + theme_bw() +
labs(title = "Residual Plot for Proposed Model")
# ----------------------------------------------------------------------------
# Let's look for constant or non-constant variance in residuals by decile of fitted value
sch_score$bin <- dplyr::ntile(sch_score$m2_fitted, 10)
sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m2_resid, na.rm = TRUE),
            error_sd = sd(m2_resid, na.rm = TRUE))
sch_score$bin <- dplyr::ntile(sch_score$m1_fitted, 10)
sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m1_resid, na.rm = TRUE),
            error_sd = sd(m1_resid, na.rm = TRUE))
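# For a more formal check of non-constant variance, a hedged sketch using the
# Breusch-Pagan test, assuming the lmtest package is installed:
# install.packages("lmtest")
library(lmtest)
bptest(m2) # a small p-value is evidence of heteroskedastic residuals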
# ----------------------------------------------------------------------------
# Standard residuals
# Let's evaluate which schools would stand out as exceptionally high or low
# performers using standardized residuals
sch_score$m1_std_resid <- rstandard(m1)
# Manually compute the standardized residual because of missing values
sch_score$m2_std_resid <- sch_score$m2_resid / sd(sch_score$m2_resid, na.rm = TRUE)
# Compare original model residuals with new model standardized residuals
# Look at which schools are standouts
table(sch_score$m1_std_resid >= 2, sch_score$test_year)
table(sch_score$m2_std_resid >= 2, sch_score$test_year)
table(sch_score$m1_std_resid >= 2, sch_score$grade)
table(sch_score$m2_std_resid >= 2, sch_score$grade)
table(sch_score$m1_std_resid >= 2, sch_score$subject)
table(sch_score$m2_std_resid >= 2, sch_score$subject)
# We are still more likely to detect changes in lower grades, in math, and in
# schools tested in 2007 and 2009
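# Rates are easier to read than raw counts here; a quick sketch converting the
# grade tables above to column proportions:
prop.table(table(sch_score$m1_std_resid >= 2, sch_score$grade), margin = 2)
prop.table(table(sch_score$m2_std_resid >= 2, sch_score$grade), margin = 2)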
#-----------------------------------------------------------------------------
# outlier values
# Let's look at where outliers are on our plot
sch_score$flag_x <- ifelse(abs(as.numeric(scale(sch_score$ss1))) >= 3, "Y", "N")
sch_score$flag_y <- ifelse(abs(as.numeric(scale(sch_score$ss2))) >= 3, "Y", "N")
table(x_outlier = sch_score$flag_x, y_outlier = sch_score$flag_y)
sch_score$outlier_flag <- ifelse(sch_score$flag_x == "Y" |
                                   sch_score$flag_y == "Y", "Yes", "No")
ggplot(sch_score, aes(x = ss1, y = ss2, color = outlier_flag)) +
  geom_point(alpha = I(0.4)) +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(group = 1), method = "lm", linetype = 2, se = FALSE) +
  theme_bw()
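# Flagging on x/y position alone ignores influence on the fitted model itself.
# A hedged sketch using Cook's distance with the common 4/n rule of thumb:
cd <- cooks.distance(m2)
sum(cd > 4 / length(cd)) # observations with outsized influence on m2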
# ----------------------------------------------------------------------------
# Look at largest district compared to other districts
# m2_bos - balance of state, without the largest district
# Fit a model only to districts that are not the largest district
m2_bos <- lm(formula(m2), # we can pass the sample restriction to the data argument
             data = sch_score[sch_score$distid != 174, ])
# m2_174 - only the largest district
m2_174 <- lm(formula(m2),
             data = sch_score[sch_score$distid == 174, ])
# Review coefficients
coef(m2_bos)
coef(m2_174)
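# A hedged sketch for comparing the two sets of coefficients visually, assuming
# the coefplot package's multiplot() function is available in your version:
multiplot(m2_bos, m2_174, intercept = FALSE)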
# Create a variable holding predictions for the largest district from the model
# fit to that subsample, and predictions for all other districts from the model
# fit to the balance of state.
sch_score$m2_bos_dist[sch_score$distid == 174] <-
  predict(m2_174, newdata = sch_score[sch_score$distid == 174, ])
sch_score$m2_bos_dist[sch_score$distid != 174] <-
  predict(m2_bos, newdata = sch_score[sch_score$distid != 174, ])
# For plotting, create a flag indicating whether each observation is in the
# balance of state (i.e., not the largest district).
sch_score$bos_flag <- ifelse(sch_score$distid == 174, "No", "Yes")
# Plot
ggplot(sch_score, aes(x = ss1, y = ss2)) +
  geom_point(alpha = I(0.4), color = "cadetblue3") +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(y = m2_bos_dist, color = bos_flag, group = bos_flag),
              linetype = 2, method = "lm") +
  labs(title = "Comparing District 174 Model to Balance of State",
       subtitle = "Analyzing Fitted Values from M2 on Subsamples",
       caption = "Model represented by linear average of predictions.",
       x = "Pre-test", y = "Post-test", color = "Balance of State") +
  theme_bw()
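# A hedged sketch to quantify the difference: compare root mean squared prediction
# error for District 174 under the pooled model (m2) and the district-only model
# (m2_174). Note that m2_174 is evaluated in-sample, so expect it to look better.
d174 <- sch_score[sch_score$distid == 174, ]
sqrt(mean((d174$ss2 - predict(m2, newdata = d174))^2, na.rm = TRUE))
sqrt(mean((d174$ss2 - predict(m2_174, newdata = d174))^2, na.rm = TRUE))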