grantmcdermott/bboot.R

## bboot.R
# Context: https://twitter.com/grant_mcdermott/status/1487528757418102787

library(data.table)
library(fixest)

bboot =
  function(object, reps = 100L, cluster = NULL, ...) {

    fixest_obj = inherits(object, c('fixest', 'fixest_multi'))

    if (inherits(object, c('lm'))) {
      Ymat = object$model[, 1]
    } else if (fixest_obj) {
      Ymat = model.matrix(object, type = 'lhs', as.matrix = TRUE)
    } else {
      stop('\nModel or object class not currently supported.\n')
    }

    Xmat = model.matrix(object)
    n_weights = nrow(Xmat)

    fmat = NULL
    if (fixest_obj && !is.null(object$fixef_vars)) {
      fmat = model.matrix(object, type = 'fixef')
    }


    ## Have to do a bit of leg work to pull out the clusters and match to
    ## model matrix
    if (!is.null(cluster)) {

      if (inherits(cluster, "formula")) {
        cl_string = strsplit(paste0(cluster)[2], split = ' \\+ ')[[1]]
      } else {
        cl_string = paste(cluster)
      }
      if (!is.null(fmat) && all(cl_string %in% colnames(fmat))) {
        cl_mat = fmat[, cl_string]
      } else if (all(cl_string %in% colnames(Xmat))) {
        cl_mat = Xmat[, cl_string]
      } else {
        DATA = eval(object$call$data)
        if (all(cl_string %in% names(DATA))) {
          all_vars = sapply(list(Ymat, Xmat, fmat), colnames)
          if (inherits(all_vars, 'list')) all_vars = do.call('c', all_vars)
          all_vars = union(all_vars, cl_string)
          DATA = data.frame(DATA)[, intersect(colnames(DATA), all_vars)]
          DATA = DATA[complete.cases(DATA), ]
          cl_mat = model.matrix(~0+., DATA[, cl_string, drop=FALSE])
        } else {
          stop(paste0('Could not find ', cluster, '. Please provide a valid input.\n'))
        }
      }

      if (!inherits(cl_mat, "matrix")) cl_mat = matrix(cl_mat)
      n_weights = nrow(unique(cl_mat))

      ## Keep track of cluster id for consistent weighting  within each
      ## cluster later on
      cl_mat = data.table::as.data.table(cl_mat)
      cl_mat$cl_id = data.table::frank(cl_mat, ties.method = "dense")

    }

    ## Pre-allocate space for efficiency
    wfits = matrix(0, reps, length(object$coefficients))

    for (i in 1:reps) {

      if (is.null(cluster)) {
        weights = rexp(n_weights, rate = 1)
      } else {
        weights = cl_mat[, wt := rexp(1, rate = 1), by = cl_id][, wt]
      }

      ## Normalise weights
      ## (Unnecessary? https://twitter.com/deaneckles/status/1487506960698200067)
      # weights = weights / sum(weights)

      ## Demean X and Y matrices if fixed effects are present
      if (!is.null(fmat)) {
        Xmat = fixest::demean(X = Xmat, f = fmat, weights = weights)
        Ymat = fixest::demean(X = Ymat, f = fmat, weights = weights)
      }

      ## Fit weighted reg
      wfits[i, ] = lm.wfit(x = Xmat, y = Ymat, w = weights)$coefficients
    }

    colnames(wfits) = colnames(Xmat)

    class(wfits) = "bboot"

    ## Meta attributes
    attr(wfits, "coefs") = try(coefficients(object), silent = TRUE)
    attr(wfits, "df") = try(df.residual(object), silent = TRUE)
    attr(wfits, "se") = try(sqrt(diag(cov(wfits))), silent = TRUE)
    attr(wfits, "reps") = reps

    return(wfits)

  }


#
## Methods
#

summary.bboot = function(object, level = 0.95, ...) {
  alpha = 1 - level
  lwr = alpha/2
  upr = 1-lwr
  est = attr(object, "coefs")
  se = attr(object, "se")
  df = attr(object, "df")

  cnames = c("Estimate", "Std. Error")

  tval = as.vector(est)/se
  if (is.null(df)) df = 0
  if (is.finite(df) && df > 0) {
    pval = 2 * pt(abs(tval), df = df, lower.tail = FALSE)
    fac = qt(alpha, df = df)
    cnames = c(cnames, "t value", "Pr(>|t|)")
  } else {
    pval = 2 * pnorm(abs(tval), lower.tail = FALSE)
    fac = qnorm(alpha)
    cnames = c(cnames, "z value", "Pr(>|z|)")
  }

  out = cbind(est, se, tval, pval)
  colnames(out) = cnames

  ci = cbind(est + fac * se, est - fac * se)
  colnames(ci) = c("Lower", "Upper")

  out = cbind(out, ci)

  # qtiles = t(apply(object, 2, \(x) c(mean(x), quantile(x, c(lwr, upr)))))
  # colnames(qtiles) = c("mean", "conf.low", "conf.upper")
  # attr(out, "quantiles") = qtiles

  out
}

print.bboot = function(object, ...) {
  out = summary(object, ...)
  cat("Bayesian bootstrap: Standard errors based on", attr(object, "reps"), "reps.", "\n\n")
  print(out, digits = 4, quote = FALSE, print.gap = 2L)
}


#
## Examples
#

set.seed(123)

mod = lm(mpg ~ wt + hp, mtcars)
bb_mod = bboot(mod, reps = 1e3)
bb_mod
hist(bb_mod[, 'wt'],
     breaks = 100,
     border = 'white',
     main = 'Bayesian bootstrap: wt')

# Add cluster variables with a formula
set.seed(42)
bboot(mod, reps = 1e3, cluster = ~cyl)

# Same result for fixest::feols (no FEs)
set.seed(42)
feols(mpg ~ wt + hp, mtcars) |>
  bboot(reps = 1e3, cluster = ~cyl)

# Incl. FEs is fine too (although adds a bit of overhead)
feols(mpg ~ wt + hp | cyl, mtcars, vcov = 'iid') |>
  bboot(reps = 1e3)

# Can combine FEs and clustering as well
feols(mpg ~ wt + hp | cyl, mtcars, vcov = ~cyl) |>
  bboot(reps = 1e3, cluster = ~cyl)
	# Context: https://twitter.com/grant_mcdermott/status/1487528757418102787

	library(data.table)
	library(fixest)

	bboot =
	function(object, reps = 100L, cluster = NULL, ...) {

	fixest_obj = inherits(object, c('fixest', 'fixest_multi'))

	if (inherits(object, c('lm'))) {
	Ymat = object$model[, 1]
	} else if (fixest_obj) {
	Ymat = model.matrix(object, type = 'lhs', as.matrix = TRUE)
	} else {
	stop('\nModel or object class not currently supported.\n')
	}

	Xmat = model.matrix(object)
	n_weights = nrow(Xmat)

	fmat = NULL
	if (fixest_obj && !is.null(object$fixef_vars)) {
	fmat = model.matrix(object, type = 'fixef')
	}


	## Have to do a bit of leg work to pull out the clusters and match to
	## model matrix
	if (!is.null(cluster)) {

	if (inherits(cluster, "formula")) {
	cl_string = strsplit(paste0(cluster)[2], split = ' \\+ ')[[1]]
	} else {
	cl_string = paste(cluster)
	}
	if (!is.null(fmat) && all(cl_string %in% colnames(fmat))) {
	cl_mat = fmat[, cl_string]
	} else if (all(cl_string %in% colnames(Xmat))) {
	cl_mat = Xmat[, cl_string]
	} else {
	DATA = eval(object$call$data)
	if (all(cl_string %in% names(DATA))) {
	all_vars = sapply(list(Ymat, Xmat, fmat), colnames)
	if (inherits(all_vars, 'list')) all_vars = do.call('c', all_vars)
	all_vars = union(all_vars, cl_string)
	DATA = data.frame(DATA)[, intersect(colnames(DATA), all_vars)]
	DATA = DATA[complete.cases(DATA), ]
	cl_mat = model.matrix(~0+., DATA[, cl_string, drop=FALSE])
	} else {
	stop(paste0('Could not find ', cluster, '. Please provide a valid input.\n'))
	}
	}

	if (!inherits(cl_mat, "matrix")) cl_mat = matrix(cl_mat)
	n_weights = nrow(unique(cl_mat))

	## Keep track of cluster id for consistent weighting within each
	## cluster later on
	cl_mat = data.table::as.data.table(cl_mat)
	cl_mat$cl_id = data.table::frank(cl_mat, ties.method = "dense")

	}

	## Pre-allocate space for efficiency
	wfits = matrix(0, reps, length(object$coefficients))

	for (i in 1:reps) {

	if (is.null(cluster)) {
	weights = rexp(n_weights, rate = 1)
	} else {
	weights = cl_mat[, wt := rexp(1, rate = 1), by = cl_id][, wt]
	}

	## Normalise weights
	## (Unnecessary? https://twitter.com/deaneckles/status/1487506960698200067)
	# weights = weights / sum(weights)

	## Demean X and Y matrices if fixed effects are present
	if (!is.null(fmat)) {
	Xmat = fixest::demean(X = Xmat, f = fmat, weights = weights)
	Ymat = fixest::demean(X = Ymat, f = fmat, weights = weights)
	}

	## Fit weighted reg
	wfits[i, ] = lm.wfit(x = Xmat, y = Ymat, w = weights)$coefficients
	}

	colnames(wfits) = colnames(Xmat)

	class(wfits) = "bboot"

	## Meta attributes
	attr(wfits, "coefs") = try(coefficients(object), silent = TRUE)
	attr(wfits, "df") = try(df.residual(object), silent = TRUE)
	attr(wfits, "se") = try(sqrt(diag(cov(wfits))), silent = TRUE)
	attr(wfits, "reps") = reps

	return(wfits)

	}


	#
	## Methods
	#

	summary.bboot = function(object, level = 0.95, ...) {
	alpha = 1 - level
	lwr = alpha/2
	upr = 1-lwr
	est = attr(object, "coefs")
	se = attr(object, "se")
	df = attr(object, "df")

	cnames = c("Estimate", "Std. Error")

	tval = as.vector(est)/se
	if (is.null(df)) df = 0
	if (is.finite(df) && df > 0) {
	pval = 2 * pt(abs(tval), df = df, lower.tail = FALSE)
	fac = qt(alpha, df = df)
	cnames = c(cnames, "t value", "Pr(>\|t\|)")
	} else {
	pval = 2 * pnorm(abs(tval), lower.tail = FALSE)
	fac = qnorm(alpha)
	cnames = c(cnames, "z value", "Pr(>\|z\|)")
	}

	out = cbind(est, se, tval, pval)
	colnames(out) = cnames

	ci = cbind(est + fac * se, est - fac * se)
	colnames(ci) = c("Lower", "Upper")

	out = cbind(out, ci)

	# qtiles = t(apply(object, 2, \(x) c(mean(x), quantile(x, c(lwr, upr)))))
	# colnames(qtiles) = c("mean", "conf.low", "conf.upper")
	# attr(out, "quantiles") = qtiles

	out
	}

	print.bboot = function(object, ...) {
	out = summary(object, ...)
	cat("Bayesian bootstrap: Standard errors based on", attr(object, "reps"), "reps.", "\n\n")
	print(out, digits = 4, quote = FALSE, print.gap = 2L)
	}


	#
	## Examples
	#

	set.seed(123)

	mod = lm(mpg ~ wt + hp, mtcars)
	bb_mod = bboot(mod, reps = 1e3)
	bb_mod
	hist(bb_mod[, 'wt'],
	breaks = 100,
	border = 'white',
	main = 'Bayesian bootstrap: wt')

	# Add cluster variables with a formula
	set.seed(42)
	bboot(mod, reps = 1e3, cluster = ~cyl)

	# Same result for fixest::feols (no FEs)
	set.seed(42)
	feols(mpg ~ wt + hp, mtcars) \|>
	bboot(reps = 1e3, cluster = ~cyl)

	# Incl. FEs is fine too (although adds a bit of overhead)
	feols(mpg ~ wt + hp \| cyl, mtcars, vcov = 'iid') \|>
	bboot(reps = 1e3)

	# Can combine FEs and clustering as well
	feols(mpg ~ wt + hp \| cyl, mtcars, vcov = ~cyl) \|>
	bboot(reps = 1e3, cluster = ~cyl)