Skip to content

Instantly share code, notes, and snippets.

  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save pjastam/14cfd2a60b0787239ed6a2f18c3ec03b to your computer and use it in GitHub Desktop.
Regression Analysis of Aggregate Data in R
## -----------------------------------------------------------------------------------------------------------------------------
library(readtext)
library(dplyr)
data <- readtext("https://cdn-links.lww.com/permalink/ede/a/ede_25_6_2014_07_23_urquia_ede14-408_sdc1.doc") %>%
subset(select = "text") %>%
{ read.table(text=sub(".*4) individual-level dataset", "", .), header=TRUE) }
## -----------------------------------------------------------------------------------------------------------------------------
db_indiv <- data %>%
rename_all(tolower) %>%
mutate(
wtgaink = wtgain * 0.453592,
wic = as.factor(wic),
mracethn = as.factor(mracethn),
mracethn = relevel(mracethn, ref = 3),
latecare = as.factor(latecare)
) %>%
select(wtgaink, wic, mracethn, latecare)
## -----------------------------------------------------------------------------------------------------------------------------
model_indiv <-
lm(
formula = wtgaink ~ wic + mracethn + latecare,
data = db_indiv
)
summ_indiv <- summary(model_indiv)
summ_indiv
## -----------------------------------------------------------------------------------------------------------------------------
db_aggr <-
db_indiv %>%
group_by(across(c(wic, mracethn, latecare))) %>%
summarise(
meany = mean(wtgaink),
stdy = sd(wtgaink),
N = length(wtgaink)
) %>%
ungroup()
## -----------------------------------------------------------------------------------------------------------------------------
model_aggr <-
lm(
formula = meany ~ wic + mracethn + latecare,
data = db_aggr,
weights = N
)
summ_aggr <- summary(model_aggr)
summ_aggr
## -----------------------------------------------------------------------------------------------------------------------------
all.equal(coef(model_indiv),coef(model_aggr))
## -----------------------------------------------------------------------------------------------------------------------------
summ_indiv$coef[,2]/summ_aggr$coef[,2]
## -----------------------------------------------------------------------------------------------------------------------------
standard_errors <-
db_aggr %>%
mutate(
# degrees of freedom used to calculate stdy
n_1 = N - 1,
# total sum of squares derived from stdy
S_n_1 = stdy ^ 2 * n_1
) %>%
summarise(
# degrees of freedom
n_k = sum(n_1),
# total sum of squares
S2_k = sum(S_n_1),
# pooled variance (i.e. weighted average of group variances)
p_v = S2_k / n_k,
# aggregate-level residual variance
errorms = summ_aggr$sigma ^ 2,
# inflation factor
factor = sqrt(p_v / errorms),
# aggregate-level standard errors
StdErr = summ_aggr$coef[ , 2],
# individual-level standard errors
standard_error = StdErr * factor
)
## -----------------------------------------------------------------------------------------------------------------------------
library(insight)
standard_errors %>%
mutate(
estimate = coefficients(model_aggr),
z = estimate / standard_error,
p = 2 * (1 - pnorm(abs(z))),
pvalue = format_p(p, stars = TRUE),
CI_low = estimate - 1.96 * standard_error,
CI_high = estimate + 1.96 * standard_error,
) %>%
bind_cols("predictors" = names(standard_errors$standard_error)) %>%
select(predictors, estimate, standard_error, CI_low, CI_high, z, pvalue) %>%
format_table(digits = 3, ci_digits = 3)
db_indiv <- data %>%
rename_all(tolower) %>%
mutate(
# 'Multiply wtgain by a correction factor
wtgaink = wtgain * 0.453592,
# Create dichotomous variable wic with 0 = "Y" and -1 = "N'
wic = 0*(wic == "Y") - 1*(wic == "N"),
# Hispanic = -1 (else 0) to reproduce Model 2
# Note: Hispanic = 1 (else 0) to reproduce Model 3
mracethn1 = -1*(mracethn == 1),
# Non-Hispanic White = 1 (else zero)
mracethn2 = 1*(mracethn == 2),
# Non-Hispanic Black = 1 (else zero)
mracethn3 = 1*(mracethn == 0),
# Recode latecare to 0 for late entry and -1 for no late entry
latecare = latecare - 1
) %>%
select(wtgaink, wic, mracethn1, mracethn2, mracethn3, latecare)
formula_model_1 <- formula("wtgaink ~ wic")
formula_model_2 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare")
formula_model_3 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare + mracethn1*latecare + mracethn2*latecare")
model_indiv <-
lm(
formula = formula_model_2,
data = db_indiv
)
summ_indiv <- summary(model_indiv)
summ_indiv

Regression Analysis of Aggregate Data in R

These files are a supplement to my blog post about using R for vertical aggregation in data analysis.

@mbojan
Copy link

mbojan commented Apr 2, 2023

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment