Bayesian bootstrap
# Context: https://twitter.com/grant_mcdermott/status/1487528757418102787
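## Quick recap of the idea (Rubin 1981): instead of resampling rows, draw a
## fresh set of observation weights ~ Exp(1) each replication (equivalent,
## once normalised, to flat Dirichlet weights) and refit the weighted
## regression. The resulting coefficient draws can be read as approximate
## posterior draws. A minimal standalone sketch of a single replication,
## using the mtcars model from the examples below (illustration only, not
## part of bboot() itself):
# w = rexp(nrow(mtcars), rate = 1)
# lm(mpg ~ wt + hp, data = mtcars, weights = w)$coefficients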
library(data.table)
library(fixest)
bboot = function(object, reps = 100L, cluster = NULL, ...) {
  fixest_obj = inherits(object, c('fixest', 'fixest_multi'))
  if (inherits(object, 'lm')) {
    Ymat = object$model[, 1]
  } else if (fixest_obj) {
    Ymat = model.matrix(object, type = 'lhs', as.matrix = TRUE)
  } else {
    stop('\nModel or object class not currently supported.\n')
  }
  Xmat = model.matrix(object)
  n_weights = nrow(Xmat)
  fmat = NULL
  if (fixest_obj && !is.null(object$fixef_vars)) {
    fmat = model.matrix(object, type = 'fixef')
  }
  ## Have to do a bit of leg work to pull out the clusters and match them to
  ## the model matrix
  if (!is.null(cluster)) {
    if (inherits(cluster, "formula")) {
      cl_string = strsplit(paste0(cluster)[2], split = ' \\+ ')[[1]]
    } else {
      cl_string = paste(cluster)
    }
    if (!is.null(fmat) && all(cl_string %in% colnames(fmat))) {
      cl_mat = fmat[, cl_string]
    } else if (all(cl_string %in% colnames(Xmat))) {
      cl_mat = Xmat[, cl_string]
    } else {
      DATA = eval(object$call$data)
      if (all(cl_string %in% names(DATA))) {
        all_vars = sapply(list(Ymat, Xmat, fmat), colnames)
        if (inherits(all_vars, 'list')) all_vars = do.call('c', all_vars)
        all_vars = union(all_vars, cl_string)
        DATA = data.frame(DATA)[, intersect(colnames(DATA), all_vars)]
        DATA = DATA[complete.cases(DATA), ]
        cl_mat = model.matrix(~ 0 + ., DATA[, cl_string, drop = FALSE])
      } else {
        stop('Could not find ', paste(cl_string, collapse = ', '), '. Please provide a valid cluster input.\n')
      }
    }
    if (!inherits(cl_mat, "matrix")) cl_mat = matrix(cl_mat)
    n_weights = nrow(unique(cl_mat))
    ## Keep track of the cluster id so that all rows in a cluster share the
    ## same weight later on (see the small illustration after this function)
    cl_mat = data.table::as.data.table(cl_mat)
    cl_mat$cl_id = data.table::frank(cl_mat, ties.method = "dense")
  }
  ## Pre-allocate space for efficiency
  wfits = matrix(0, reps, length(object$coefficients))
  for (i in 1:reps) {
    if (is.null(cluster)) {
      weights = rexp(n_weights, rate = 1)
    } else {
      weights = cl_mat[, wt := rexp(1, rate = 1), by = cl_id][, wt]
    }
    ## Normalise weights
    ## (Unnecessary? https://twitter.com/deaneckles/status/1487506960698200067)
    # weights = weights / sum(weights)
    ## Demean X and Y matrices if fixed effects are present
    if (!is.null(fmat)) {
      Xmat = fixest::demean(X = Xmat, f = fmat, weights = weights)
      Ymat = fixest::demean(X = Ymat, f = fmat, weights = weights)
    }
    ## Fit weighted reg
    wfits[i, ] = lm.wfit(x = Xmat, y = Ymat, w = weights)$coefficients
  }
  colnames(wfits) = colnames(Xmat)
  class(wfits) = "bboot"
  ## Meta attributes
  attr(wfits, "coefs") = try(coefficients(object), silent = TRUE)
  attr(wfits, "df") = try(df.residual(object), silent = TRUE)
  attr(wfits, "se") = try(sqrt(diag(cov(wfits))), silent = TRUE)
  attr(wfits, "reps") = reps
  return(wfits)
}
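## For intuition on the cluster handling above: every row in the same cluster
## shares a single Exp(1) draw, so whole clusters are up- or down-weighted
## together. A tiny standalone illustration (not part of bboot(); the demo
## object below is made up for exposition):
# cl_demo = data.table(cl_id = c(1, 1, 2, 2, 3))
# cl_demo[, wt := rexp(1, rate = 1), by = cl_id][]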
#
## Methods
#
summary.bboot = function(object, level = 0.95, ...) {
  alpha = 1 - level
  lwr = alpha / 2
  upr = 1 - lwr
  est = attr(object, "coefs")
  se = attr(object, "se")
  df = attr(object, "df")
  cnames = c("Estimate", "Std. Error")
  tval = as.vector(est) / se
  if (is.null(df)) df = 0
  if (is.finite(df) && df > 0) {
    pval = 2 * pt(abs(tval), df = df, lower.tail = FALSE)
    fac = qt(lwr, df = df)  ## use alpha/2 so the interval width matches `level`
    cnames = c(cnames, "t value", "Pr(>|t|)")
  } else {
    pval = 2 * pnorm(abs(tval), lower.tail = FALSE)
    fac = qnorm(lwr)
    cnames = c(cnames, "z value", "Pr(>|z|)")
  }
  out = cbind(est, se, tval, pval)
  colnames(out) = cnames
  ci = cbind(est + fac * se, est - fac * se)
  colnames(ci) = c("Lower", "Upper")
  out = cbind(out, ci)
  # qtiles = t(apply(object, 2, \(x) c(mean(x), quantile(x, c(lwr, upr)))))
  # colnames(qtiles) = c("mean", "conf.low", "conf.upper")
  # attr(out, "quantiles") = qtiles
  out
}
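## Optional companion method (not in the original gist): percentile intervals
## taken directly from the bootstrap draws, as an alternative to the t/z
## intervals reported by summary.bboot(). A minimal sketch:
confint.bboot = function(object, level = 0.95, ...) {
  probs = c((1 - level) / 2, 1 - (1 - level) / 2)
  out = t(apply(unclass(object), 2, quantile, probs = probs))
  colnames(out) = paste0(format(100 * probs, trim = TRUE), " %")
  out
}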
print.bboot = function(x, ...) {
  out = summary(x, ...)
  cat("Bayesian bootstrap: Standard errors based on", attr(x, "reps"), "reps.", "\n\n")
  print(out, digits = 4, quote = FALSE, print.gap = 2L)
  invisible(x)
}
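## Optional plot method (not in the original gist): wraps the hist() call used
## in the examples below, so the draws for any single coefficient can be
## inspected quickly. A minimal sketch:
plot.bboot = function(x, coef = colnames(x)[1], breaks = 100, ...) {
  hist(x[, coef], breaks = breaks, border = 'white',
       main = paste('Bayesian bootstrap:', coef), xlab = coef, ...)
}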
#
## Examples
#
set.seed(123)
mod = lm(mpg ~ wt + hp, mtcars)
bb_mod = bboot(mod, reps = 1e3)
bb_mod
hist(bb_mod[, 'wt'],
     breaks = 100,
     border = 'white',
     main = 'Bayesian bootstrap: wt')
# Add cluster variables with a formula
set.seed(42)
bboot(mod, reps = 1e3, cluster = ~cyl)
# Same result for fixest::feols (no FEs)
set.seed(42)
feols(mpg ~ wt + hp, mtcars) |>
  bboot(reps = 1e3, cluster = ~cyl)
# Incl. FEs is fine too (although it adds a bit of overhead)
feols(mpg ~ wt + hp | cyl, mtcars, vcov = 'iid') |>
  bboot(reps = 1e3)
# Can combine FEs and clustering as well
feols(mpg ~ wt + hp | cyl, mtcars, vcov = ~cyl) |>
  bboot(reps = 1e3, cluster = ~cyl)
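# If the optional confint()/plot() sketches above are defined, the draws can
# also be summarised or inspected directly (illustration only):
# confint(bb_mod)
# plot(bb_mod, coef = 'wt')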