These files are a supplement to my blog post about using R for vertical aggregation in data analysis.
Last active
April 2, 2023 18:30
-
-
Save pjastam/14cfd2a60b0787239ed6a2f18c3ec03b to your computer and use it in GitHub Desktop.
Regression Analysis of Aggregate Data in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ----------------------------------------------------------------------------------------------------------------------------- | |
library(readtext) | |
library(dplyr) | |
data <- readtext("https://cdn-links.lww.com/permalink/ede/a/ede_25_6_2014_07_23_urquia_ede14-408_sdc1.doc") %>% | |
subset(select = "text") %>% | |
{ read.table(text=sub(".*4) individual-level dataset", "", .), header=TRUE) } | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
db_indiv <- data %>% | |
rename_all(tolower) %>% | |
mutate( | |
wtgaink = wtgain * 0.453592, | |
wic = as.factor(wic), | |
mracethn = as.factor(mracethn), | |
mracethn = relevel(mracethn, ref = 3), | |
latecare = as.factor(latecare) | |
) %>% | |
select(wtgaink, wic, mracethn, latecare) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
model_indiv <- | |
lm( | |
formula = wtgaink ~ wic + mracethn + latecare, | |
data = db_indiv | |
) | |
summ_indiv <- summary(model_indiv) | |
summ_indiv | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
db_aggr <- | |
db_indiv %>% | |
group_by(across(c(wic, mracethn, latecare))) %>% | |
summarise( | |
meany = mean(wtgaink), | |
stdy = sd(wtgaink), | |
N = length(wtgaink) | |
) %>% | |
ungroup() | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
model_aggr <- | |
lm( | |
formula = meany ~ wic + mracethn + latecare, | |
data = db_aggr, | |
weights = N | |
) | |
summ_aggr <- summary(model_aggr) | |
summ_aggr | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
all.equal(coef(model_indiv),coef(model_aggr)) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
summ_indiv$coef[,2]/summ_aggr$coef[,2] | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
standard_errors <- | |
db_aggr %>% | |
mutate( | |
# degrees of freedom used to calculate stdy | |
n_1 = N - 1, | |
# total sum of squares derived from stdy | |
S_n_1 = stdy ^ 2 * n_1 | |
) %>% | |
summarise( | |
# degrees of freedom | |
n_k = sum(n_1), | |
# total sum of squares | |
S2_k = sum(S_n_1), | |
# pooled variance (i.e. weighted average of group variances) | |
p_v = S2_k / n_k, | |
# aggregate-level residual variance | |
errorms = summ_aggr$sigma ^ 2, | |
# inflation factor | |
factor = sqrt(p_v / errorms), | |
# aggregate-level standard errors | |
StdErr = summ_aggr$coef[ , 2], | |
# individual-level standard errors | |
standard_error = StdErr * factor | |
) | |
## ----------------------------------------------------------------------------------------------------------------------------- | |
library(insight) | |
standard_errors %>% | |
mutate( | |
estimate = coefficients(model_aggr), | |
z = estimate / standard_error, | |
p = 2 * (1 - pnorm(abs(z))), | |
pvalue = format_p(p, stars = TRUE), | |
CI_low = estimate - 1.96 * standard_error, | |
CI_high = estimate + 1.96 * standard_error, | |
) %>% | |
bind_cols("predictors" = names(standard_errors$standard_error)) %>% | |
select(predictors, estimate, standard_error, CI_low, CI_high, z, pvalue) %>% | |
format_table(digits = 3, ci_digits = 3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
db_indiv <- data %>% | |
rename_all(tolower) %>% | |
mutate( | |
# 'Multiply wtgain by a correction factor | |
wtgaink = wtgain * 0.453592, | |
# Create dichotomous variable wic with 0 = "Y" and -1 = "N' | |
wic = 0*(wic == "Y") - 1*(wic == "N"), | |
# Hispanic = -1 (else 0) to reproduce Model 2 | |
# Note: Hispanic = 1 (else 0) to reproduce Model 3 | |
mracethn1 = -1*(mracethn == 1), | |
# Non-Hispanic White = 1 (else zero) | |
mracethn2 = 1*(mracethn == 2), | |
# Non-Hispanic Black = 1 (else zero) | |
mracethn3 = 1*(mracethn == 0), | |
# Recode latecare to 0 for late entry and -1 for no late entry | |
latecare = latecare - 1 | |
) %>% | |
select(wtgaink, wic, mracethn1, mracethn2, mracethn3, latecare) | |
formula_model_1 <- formula("wtgaink ~ wic") | |
formula_model_2 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare") | |
formula_model_3 <- formula("wtgaink ~ wic + mracethn1 + mracethn2 + latecare + mracethn1*latecare + mracethn2*latecare") | |
model_indiv <- | |
lm( | |
formula = formula_model_2, | |
data = db_indiv | |
) | |
summ_indiv <- summary(model_indiv) | |
summ_indiv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks. Fyi, I asked myself similar questions here https://blog.michalbojanowski.com/2015/09/linear-models-with-weighted-observations/