statwonk/clustering.R

## clustering.R
library(tidyverse)
library(lmtest)
library(sandwich)

# Simulate students nested in schools, fit an OLS model of the outcome on the
# treatment variable, and compare conventional standard errors with
# school-clustered ones.
students <- 5e2 # number of simulated students
schools <- 20 # number of simulated schools (clusters)

# One random intercept per school, shared by every student in that school.
school_effects <- tibble(
  school_id = seq_len(schools),
  school_effect = rnorm(schools)
)

# Keep the simulated data under a name. The cluster variable is not part of
# the model formula, so vcovCL() has to re-evaluate the model's `data`
# argument to find it; a pipe placeholder (`data = .`) is no longer bound
# once the pipeline has finished, which makes that lookup fail.
sim_data <- tibble(student_id = seq_len(students)) %>%
  # Cycle school ids 1..schools over the students; rep_len() also copes with
  # a student count that is not a multiple of the school count.
  mutate(school_id = rep_len(seq_len(schools), students)) %>%
  left_join(school_effects, by = "school_id") %>%
  mutate(
    # `%%` binds tighter than `*`: odd student ids get 0.5, even ids get 0.
    treatment_effect = 0.5 * (student_id %% 2),
    y = treatment_effect + school_effect
  )

m <- lm(y ~ treatment_effect, data = sim_data)

coeftest(m) # not clustered

# Clustered standard errors: cluster on the school identifier itself rather
# than on the (continuous) simulated school effect.
coeftest(m, vcov. = vcovCL(m, cluster = ~ school_id))

# If we exhaust the population of schools, we don't need to cluster because we don't need to speak to schools not in our sample.
	library(tidyverse)
	library(lmtest)
	library(sandwich)

	# Simulate students nested in schools, fit an OLS model of the outcome on the
	# treatment variable, and compare conventional standard errors with
	# school-clustered ones.
	students <- 5e2 # number of simulated students
	schools <- 20 # number of simulated schools (clusters)

	# One random intercept per school, shared by every student in that school.
	school_effects <- tibble(
	  school_id = seq_len(schools),
	  school_effect = rnorm(schools)
	)

	# Keep the simulated data under a name. The cluster variable is not part of
	# the model formula, so vcovCL() has to re-evaluate the model's `data`
	# argument to find it; a pipe placeholder (`data = .`) is no longer bound
	# once the pipeline has finished, which makes that lookup fail.
	sim_data <- tibble(student_id = seq_len(students)) %>%
	  # Cycle school ids 1..schools over the students; rep_len() also copes with
	  # a student count that is not a multiple of the school count.
	  mutate(school_id = rep_len(seq_len(schools), students)) %>%
	  left_join(school_effects, by = "school_id") %>%
	  mutate(
	    # `%%` binds tighter than `*`: odd student ids get 0.5, even ids get 0.
	    treatment_effect = 0.5 * (student_id %% 2),
	    y = treatment_effect + school_effect
	  )

	m <- lm(y ~ treatment_effect, data = sim_data)

	coeftest(m) # not clustered

	# Clustered standard errors: cluster on the school identifier itself rather
	# than on the (continuous) simulated school effect.
	coeftest(m, vcov. = vcovCL(m, cluster = ~ school_id))

	# If we exhaust the population of schools, we don't need to cluster because we don't need to speak to schools not in our sample.