Regression Analysis of Aggregate Data in R
These files are a supplement to my blog post about using R for vertical aggregation in data analysis.
## ----------------------------------------------------------------------------------------------------------------------------- | |
library(readtext) | |
library(dplyr) | |
data <- readtext("https://cdn-links.lww.com/permalink/ede/a/ede_25_6_2014_07_23_urquia_ede14-408_sdc1.doc") %>% | |
subset(select = "text") %>% | |
{ read.table(text=sub(".*4) individual-level dataset", "", .), header=TRUE) } | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
db_indiv <- data %>% | |
rename_all(tolower) %>% | |
mutate( | |
wtgaink = wtgain * 0.453592, | |
wic = as.factor(wic), | |
mracethn = as.factor(mracethn), | |
mracethn = relevel(mracethn, ref = 3), | |
latecare = as.factor(latecare) | |
) %>% | |
select(wtgaink, wic, mracethn, latecare) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
model_indiv <- | |
lm( | |
formula = wtgaink ~ wic + mracethn + latecare, | |
data = db_indiv | |
) | |
summ_indiv <- summary(model_indiv) | |
summ_indiv | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
db_aggr <- | |
db_indiv %>% | |
group_by(across(c(wic, mracethn, latecare))) %>% | |
summarise( | |
meany = mean(wtgaink), | |
stdy = sd(wtgaink), | |
N = length(wtgaink) | |
) %>% | |
ungroup() | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
model_aggr <- | |
lm( | |
formula = meany ~ wic + mracethn + latecare, | |
data = db_aggr, | |
weights = N | |
) | |
summ_aggr <- summary(model_aggr) | |
summ_aggr | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
all.equal(coef(model_indiv),coef(model_aggr)) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
summ_indiv$coef[,2]/summ_aggr$coef[,2] | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
standard_errors <- | |
db_aggr %>% | |
mutate( | |
# degrees of freedom used to calculate stdy | |
n_1 = N - 1, | |
# total sum of squares derived from stdy | |
S_n_1 = stdy ^ 2 * n_1 | |
) %>% | |
summarise( | |
# degrees of freedom | |
n_k = sum(n_1), | |
# total sum of squares | |
S2_k = sum(S_n_1), | |
# pooled variance (i.e. weighted average of group variances) | |
p_v = S2_k / n_k, | |
# aggregate-level residual variance | |
errorms = summ_aggr$sigma ^ 2, | |
# inflation factor | |
factor = sqrt(p_v / errorms), | |
# aggregate-level standard errors | |
StdErr = summ_aggr$coef[ , 2], | |
# individual-level standard errors | |
standard_error = StdErr * factor | |
) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
library(insight) | |
standard_errors %>% | |
mutate( | |
estimate = coefficients(model_aggr), | |
z = estimate / standard_error, | |
p = 2 * (1 - pnorm(abs(z))), | |
pvalue = format_p(p, stars = TRUE), | |
CI_low = estimate - 1.96 * standard_error, | |
CI_high = estimate + 1.96 * standard_error, | |
) %>% | |
bind_cols("predictors" = names(standard_errors$standard_error)) %>% | |
select(predictors, estimate, standard_error, CI_low, CI_high, z, pvalue) %>% | |
format_table(digits = 3, ci_digits = 3) |
db_indiv <- data %>% | |
rename_all(tolower) %>% | |
mutate( | |
# 'Multiply wtgain by a correction factor | |
wtgaink = wtgain * 0.453592, | |
# Create dichotomous variable wic with 0 = "Y" and -1 = "N' | |
wic = 0*(wic == "Y") - 1*(wic == "N"), | |
# Hispanic = -1 (else 0) to reproduce Model 2 | |
# Note: Hispanic = 1 (else 0) to reproduce Model 3 | |
mracethn1 = -1*(mracethn == 1), | |
# Non-Hispanic White = 1 (else zero) | |
mracethn2 = 1*(mracethn == 2), | |
# Non-Hispanic Black = 1 (else zero) | |
mracethn3 = 1*(mracethn == 0), | |
# Recode latecare to 0 for late entry and -1 for no late entry | |
latecare = latecare - 1 | |
) %>% | |
select(wtgaink, wic, mracethn1, mracethn2, mracethn3, latecare) | |
formula_model_1 <- formula("wtgaink ~ wic") | |
formula_model_2 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare") | |
formula_model_3 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare + mracethn1*latecare + mracethn2*latecare") | |
model_indiv <- | |
lm( | |
formula = formula_model_2, | |
data = db_indiv | |
) | |
summ_indiv <- summary(model_indiv) | |
summ_indiv |
These files are a supplement to my blog post about using R for vertical aggregation in data analysis.
Thanks. Fyi, I asked myself similar questions here https://blog.michalbojanowski.com/2015/09/linear-models-with-weighted-observations/