# Context: https://twitter.com/grant_mcdermott/status/1487528757418102787

library(data.table)
library(fixest)

bboot =
  function(object, reps = 100L, cluster = NULL, ...) {

    fixest_obj = inherits(object, c('fixest', 'fixest_multi'))

    if (inherits(object, c('lm'))) {
      Ymat = object$model[, 1]
    } else if (fixest_obj) {
      Ymat = model.matrix(object, type = 'lhs', as.matrix = TRUE)
    } else {
      stop('\nModel or object class not currently supported.\n')
    }

    Xmat = model.matrix(object)
    n_weights = nrow(Xmat)

    fmat = NULL
    if (fixest_obj && !is.null(object$fixef_vars)) {
      fmat = model.matrix(object, type = 'fixef')
    }

    ## Have to do a bit of leg work to pull out the clusters and match to
    ## model matrix
    if (!is.null(cluster)) {
      if (inherits(cluster, "formula")) {
        cl_string = strsplit(paste0(cluster)[2], split = ' \\+ ')[[1]]
      } else {
        cl_string = paste(cluster)
      }
      if (!is.null(fmat) && all(cl_string %in% colnames(fmat))) {
        cl_mat = fmat[, cl_string]
      } else if (all(cl_string %in% colnames(Xmat))) {
        cl_mat = Xmat[, cl_string]
      } else {
        DATA = eval(object$call$data)
        if (all(cl_string %in% names(DATA))) {
          all_vars = sapply(list(Ymat, Xmat, fmat), colnames)
          if (inherits(all_vars, 'list')) all_vars = do.call('c', all_vars)
          all_vars = union(all_vars, cl_string)
          DATA = data.frame(DATA)[, intersect(colnames(DATA), all_vars)]
          DATA = DATA[complete.cases(DATA), ]
          cl_mat = model.matrix(~ 0 + ., DATA[, cl_string, drop = FALSE])
        } else {
          stop(paste0('Could not find ', cluster, '. Please provide a valid input.\n'))
        }
      }
      if (!inherits(cl_mat, "matrix")) cl_mat = matrix(cl_mat)
      n_weights = nrow(unique(cl_mat))
      ## Keep track of cluster id for consistent weighting within each
      ## cluster later on
      cl_mat = data.table::as.data.table(cl_mat)
      cl_mat$cl_id = data.table::frank(cl_mat, ties.method = "dense")
    }

    ## Pre-allocate space for efficiency
    wfits = matrix(0, reps, length(object$coefficients))

    for (i in 1:reps) {
      if (is.null(cluster)) {
        weights = rexp(n_weights, rate = 1)
      } else {
        weights = cl_mat[, wt := rexp(1, rate = 1), by = cl_id][, wt]
      }
      ## Normalise weights
      ## (Unnecessary? https://twitter.com/deaneckles/status/1487506960698200067)
      # weights = weights / sum(weights)
      ## Demean the X and Y matrices if fixed effects are present. Work on
      ## copies so that each iteration demeans the original matrices with its
      ## own weight draw (rather than re-demeaning already-demeaned matrices).
      if (!is.null(fmat)) {
        Xmat_i = fixest::demean(X = Xmat, f = fmat, weights = weights)
        Ymat_i = fixest::demean(X = Ymat, f = fmat, weights = weights)
      } else {
        Xmat_i = Xmat
        Ymat_i = Ymat
      }
      ## Fit weighted reg
      wfits[i, ] = lm.wfit(x = Xmat_i, y = Ymat_i, w = weights)$coefficients
    }
    colnames(wfits) = colnames(Xmat)
    class(wfits) = "bboot"

    ## Meta attributes
    attr(wfits, "coefs") = try(coefficients(object), silent = TRUE)
    attr(wfits, "df") = try(df.residual(object), silent = TRUE)
    attr(wfits, "se") = try(sqrt(diag(cov(wfits))), silent = TRUE)
    attr(wfits, "reps") = reps

    return(wfits)
  }

#
## Methods
#
summary.bboot = function(object, level = 0.95, ...) {
  alpha = 1 - level
  lwr = alpha/2
  upr = 1 - lwr
  est = attr(object, "coefs")
  se = attr(object, "se")
  df = attr(object, "df")
  cnames = c("Estimate", "Std. Error")
  tval = as.vector(est) / se
  if (is.null(df)) df = 0
  if (is.finite(df) && df > 0) {
    pval = 2 * pt(abs(tval), df = df, lower.tail = FALSE)
    fac = qt(lwr, df = df)  # lower-tail critical value, e.g. qt(0.025, df) for level = 0.95
    cnames = c(cnames, "t value", "Pr(>|t|)")
  } else {
    pval = 2 * pnorm(abs(tval), lower.tail = FALSE)
    fac = qnorm(lwr)
    cnames = c(cnames, "z value", "Pr(>|z|)")
  }
  out = cbind(est, se, tval, pval)
  colnames(out) = cnames
  ## fac is negative, so est + fac*se gives the lower bound
  ci = cbind(est + fac * se, est - fac * se)
  colnames(ci) = c("Lower", "Upper")
  out = cbind(out, ci)
  # qtiles = t(apply(object, 2, \(x) c(mean(x), quantile(x, c(lwr, upr)))))
  # colnames(qtiles) = c("mean", "conf.low", "conf.upper")
  # attr(out, "quantiles") = qtiles
  out
}
print.bboot = function(object, ...) {
  out = summary(object, ...)
  cat("Bayesian bootstrap: Standard errors based on", attr(object, "reps"), "reps.", "\n\n")
  print(out, digits = 4, quote = FALSE, print.gap = 2L)
}

#
## Examples
#

set.seed(123)
mod = lm(mpg ~ wt + hp, mtcars)

bb_mod = bboot(mod, reps = 1e3)
bb_mod

hist(bb_mod[, 'wt'],
     breaks = 100,
     border = 'white',
     main = 'Bayesian bootstrap: wt')

# Add cluster variables with a formula
set.seed(42)
bboot(mod, reps = 1e3, cluster = ~cyl)

# Same result for fixest::feols (no FEs)
set.seed(42)
feols(mpg ~ wt + hp, mtcars) |>
  bboot(reps = 1e3, cluster = ~cyl)

# Incl. FEs is fine too (although adds a bit of overhead)
feols(mpg ~ wt + hp | cyl, mtcars, vcov = 'iid') |>
  bboot(reps = 1e3)

# Can combine FEs and clustering as well
feols(mpg ~ wt + hp | cyl, mtcars, vcov = ~cyl) |>
  bboot(reps = 1e3, cluster = ~cyl)
Thanks again,

I read the tweet that you mentioned about Bayesian bootstrapping. In fact, this is how I learned about it :). My understanding is that one can think of the Bayesian bootstrap much like the re-sampling bootstrap, both in terms of usage and interpretation. That is why I wanted to modify your summary function to look similar to Stata's (re-sampling) bootstrap command.

Lesson learned #1 - I guess I was too mechanical about bootstrapping. I had never considered that by using the usual confidence interval I am assuming the sampling distribution is approximately normal, so thank you for highlighting that. However, I think there are circumstances where this assumption is reasonable, and where the standard CIs and p-values are handy.
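For example, when the normal approximation is in doubt, one could read an interval straight off the bootstrap draws instead. A minimal sketch, reusing the bb_mod object from the examples above:

# Percentile CI: empirical quantiles of the draws, no normality assumption
quantile(bb_mod[, 'wt'], probs = c(0.025, 0.975))
# Normal-approximation CI for comparison
mean(bb_mod[, 'wt']) + qnorm(c(0.025, 0.975)) * sd(bb_mod[, 'wt'])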
Lesson learned #2 - Stata's documentation notes that:

I'm not sure whether this holds for the Bayesian bootstrap, but I'm highlighting it just in case you'd be interested.
Sources:
https://www.stata.com/manuals/rbootstrap.pdf#rbootstrap
https://www.stata.com/manuals/rbootstrappostestimation.pdf#rbootstrappostestimation
Hi again @Oravishayrizi. Sorry for the long delay between replies. Too many balls in the air...
I spoke to @zeileis about what he does for sandwich::vcovBS, and the short answer is to just grab the standard variance-covariance matrix (i.e. the one produced by applying cov() to all of the bootstrapped results). No small-sample adjustments, which I think is reasonable, although there has been some discussion about what to do in the presence of few clusters.
Anyway, I've adjusted my original bboot function above to include an SE attribute based on this approach. It's very simple and just calls sqrt(diag(cov(wfits))). The corresponding summary/print methods retrieve this attribute and print it.
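In other words, given the reps-by-k matrix of draws that bboot returns, the entire calculation amounts to (a minimal sketch, reusing bb_mod from the examples above):

V = cov(bb_mod)  # k x k covariance matrix across the bootstrap replications
sqrt(diag(V))    # bootstrap SEs, i.e. what gets stored in attr(bb_mod, "se")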
Also, since I'm sticking with what sandwich and lmtest do, I've gone ahead and adjusted some other model statistics to be closer in line with your parametric suggestions. The default print method should be what you're looking for. I'm taking some lazy shortcuts, but really this is just a proof of concept until Achim or Laurent decide to roll something like this into sandwich or fixest. (If not, then I might do it for my own ritest package.)
You don't need to apologize.
Thank you very much for the detailed answer!
Thanks very much @Oravishayrizi.
I'm a little bit confused, though. I think your Stata command is just implementing the standard bootstrap method (i.e. resampling rows with replacement). The bboot function that I've written above is the Bayesian bootstrap, which is based on re-weighting. See here. Also, I don't see why a parametric CI calculation should be used for the bootstrap. Isn't the point of the bootstrap to avoid parametric restrictions (e.g. assumptions about normality)? I'm only looking very quickly at your code, so apologies if I'm missing something!
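To make the distinction concrete, here is a minimal sketch of a single replication under each scheme, reusing the lm example from above (the exponential draws follow the standard Bayesian bootstrap weighting):

# Standard bootstrap: resample rows with replacement, then refit
idx = sample(nrow(mtcars), replace = TRUE)
coef(lm(mpg ~ wt + hp, mtcars[idx, ]))

# Bayesian bootstrap: keep every row, but refit with random observation weights
w = rexp(nrow(mtcars), rate = 1)
coef(lm(mpg ~ wt + hp, mtcars, weights = w))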