Fast implementation of hatvalues.fixest
This calculates (or approximates) the diagonals of the projection (hat) matrix, P_ii = x_i' (X'X)^{-1} x_i, for feols models.
Comparing p = 500 with 1M fixed effects against Kline, Saggio, and Sølvsten's computational appendix: they take ~250 seconds to approximate with parallelization, while this takes about 100 seconds without parallelization. With just fixed effects and no covariates, it takes 2.33 seconds.
library(data.table)
library(fixest)
library(Matrix)
library(MatrixExtra)
#> 'MatrixExtra' modifies important behaviors from 'Matrix'. See ?MatrixExtra-options.
library(tictoc)
#>
#> Attaching package: 'tictoc'
#> The following object is masked from 'package:data.table':
#>
#> shift
# `hatvalues` for feols models -------------------------------------------------
hatvalues.fixest <- function(model, exact = TRUE, p = 1000) {
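# NOTE: `df` below is taken from the calling environment rather than from
# `model`; a more robust version would recover the estimation data from the
# model's call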
X = did2s:::sparse_model_matrix(df, model)
# If fixed effects only, flag
fe_only = is.null(X$mat_RHS)
X = cbind(X$mat_RHS, X$mat_FE)
S_xx = crossprod(X)
# If there are only fixed effects, then can do this *super fast* with FEs
if (fe_only == TRUE) {
# https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i
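# With U = chol(S_xx) (so U'U = X'X), each leverage is
# P_ii = x_i' (X'X)^{-1} x_i = || U^{-T} x_i ||^2,
# so one sparse triangular solve computes U^{-T} X' and the column sums of
# its squared entries give every leverage at once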
U = Matrix::chol(S_xx)
Z = solve(t(U), t(X))
P_ii = Matrix::colSums(Z ^ 2)
return(P_ii)
# P_ii[1:10]
# [1] 0.2000000 0.2000000 0.2000000 0.2000000 0.2000000 0.3333333 0.3333333 0.3333333 0.1428571 0.1428571
}
# Note: forming X %*% Z materializes an n x n matrix, so this can run out of memory for large n
if (exact == TRUE) {
Z = solve(S_xx, t(X))
P_ii = diag(X %*% Z)
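# (a possibly more memory-friendly alternative, untested here:
# P_ii = Matrix::rowSums(X * t(Z)) avoids forming the n x n product)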
return(P_ii)
# P_ii[1:10]
# [1] 0.3344063 0.3336252 0.3335790 0.3339978 0.3337960 0.3333515 0.3333510 0.3335557 0.3336988 0.2001362
# mean(P_ii)
# [1] 0.2731241
}
if (exact == FALSE) {
n <- nrow(X)
k <- ncol(X)
# TODO: think about using fixest object to calculate \hat{\beta} = (X'X)^{-1} X' R_pi for i = 1, ..., p
# Use heuristics based on n*p to pick the fastest approach that fits in memory
# Solve for Z = (X'X)^{-1} (X' R_p)
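# Why this works: for Rademacher draws r_j with E[r_j r_j'] = I,
# E[(x_i' (X'X)^{-1} X' r_j)^2] = x_i' (X'X)^{-1} x_i = P_ii,
# so the average of p squared projections is an unbiased estimate of P_ii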
if (n*p <= 1000000) {
# Fastest for small n*p (avoids the slower for loops used below)
Rp <- matrix(rbinom(n * p, 1, 0.5) * 2 - 1, nrow = p, ncol = n)
X_Rp <- tcrossprod(t(X), Rp)
Z = solve(S_xx, X_Rp)
# P_ii = 1/p || X_i' Z ||^2
P_ii = colSums(crossprod(Z, t(X))^2) / p
} else if (n*p <= 10000000) {
# Memory-efficient but slower: build X'R_p one column at a time
X_Rp <- matrix(0, nrow = k, ncol = p)
for (j in 1:p) {
Rp_j <- matrix(rbinom(n, 1, 0.5) * 2 - 1, nrow = 1, ncol = n)
X_Rp[, j] <- tcrossprod(t(X), Rp_j)
}
Z = solve(S_xx, X_Rp)
# P_ii = 1/p || X_i' Z ||^2
# Densify for speed; Z is effectively dense already, so this costs no extra memory
Z_dense = as.matrix(Z)
P_ii = rep(0, nrow(X))
for (j in 1:ncol(Z_dense)) {
Zj = Z_dense[, j, drop = FALSE]
P_ii = P_ii + as.numeric(X %*% Zj)^2 / p
}
} else {
# Slightly slower, but very memory efficient
P_ii = rep(0, nrow(X))
for (j in 1:p) {
Rp_j <- matrix(rbinom(n, 1, 0.5) * 2 - 1, nrow = 1, ncol = n)
X_Rp_j <- tcrossprod(t(X), Rp_j)
Zj <- solve(S_xx, X_Rp_j)
P_ii = P_ii + as.numeric(X %*% Zj)^2 / p
}
}
return(P_ii)
# Checking, when p = 1000000, n = 3665 (1000 FEs)
# P_ii[1:10]
# [1] 0.3338723 0.3330020 0.3329582 0.3340253 0.3338307 0.3333825 0.3327888 0.3329766 0.3331747 0.2001476
# mean(P_ii)
# [1] 0.2731081
}
}
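# Because the method is named hatvalues.fixest, S3 dispatch routes plain
# hatvalues() calls on fixest objects (as in the tests below) to this function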
# Test -------------------------------------------------------------------------
N = 1000000
var_fes = 10
var_eps = 4
# Ensure each id has at least 2 obs so that P_ii < 1
N_i = pmax(2, rpois(N, 3))
# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]
# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)
tic()
h = hatvalues(est)
toc()
#> 2.333 sec elapsed
# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)
tic()
h_approx_500 = hatvalues(est, exact = FALSE, p = 500)
toc()
#> 100.744 sec elapsed
Created on 2023-05-08 with reprex v2.0.2
This uses the JLA approximation described in Kline, Saggio, and Sølvsten. The hatvalues.fixest function has two parameters: exact, which determines whether to solve for the leverages exactly or to approximate them, and p, which determines the number of draws of the Rademacher weights used in the approximation; larger values of p increase the accuracy of estimation. Even for small values of p, the average leverage is estimated accurately. One thing to note is that the estimated P_ii are noisy for individual observations, though the comparison below shows the error shrinking as p grows.
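As a quick sanity check of the JLA idea, here is a minimal toy sketch (separate from the benchmark code, on a small dense problem) confirming that averaging squared Rademacher projections recovers the exact leverages:
# Toy check: the JLA estimate converges to the exact leverages at rate 1/sqrt(p)
set.seed(1)
n <- 50; k <- 3; p <- 5000
X <- cbind(1, matrix(rnorm(n * (k - 1)), n, k - 1))
S_xx <- crossprod(X)
P_exact <- diag(X %*% solve(S_xx, t(X)))
R <- matrix(sample(c(-1, 1), n * p, replace = TRUE), n, p)  # Rademacher draws
Z <- solve(S_xx, crossprod(X, R))                           # k x p
P_jla <- rowSums((X %*% Z)^2) / p                           # 1/p sum_j (x_i' Z_j)^2
max(abs(P_exact - P_jla))  # small; shrinks like 1/sqrt(p)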
# (library loads and hatvalues.fixest definition identical to the first listing above)
# Test -------------------------------------------------------------------------
N = 1000
var_fes = 10
var_eps = 4
# Ensure each id has at least 2 obs so that P_ii < 1
N_i = pmax(2, rpois(N, 3))
# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]
# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)
tic()
h = hatvalues(est)
toc()
#> 0.048 sec elapsed
# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)
tic()
h_exact = hatvalues(est)
toc()
#> 0.532 sec elapsed
tic()
h_approx_1k = hatvalues(est, exact = FALSE, p = 1000)
toc()
#> 0.974 sec elapsed
tic()
h_approx_10k = hatvalues(est, exact = FALSE, p = 10000)
toc()
#> 9.101 sec elapsed
# How good is the approximation? (the error sd should shrink like 1/sqrt(p))
diff_1k = h_exact - h_approx_1k
mean(diff_1k)
#> [1] 0.0001043958
sd(diff_1k)
#> [1] 0.01128287
diff_10k = h_exact - h_approx_10k
mean(diff_10k)
#> [1] -0.0002276724
sd(diff_10k)
#> [1] 0.003602415
Created on 2023-05-08 with reprex v2.0.2
# (library loads and hatvalues.fixest definition identical to the first listing above)
# Test -------------------------------------------------------------------------
N = 100000
var_fes = 10
var_eps = 4
# Ensure each id has at least 2 obs so that P_ii < 1
N_i = pmax(2, rpois(N, 3))
# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]
# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)
tic()
h = hatvalues(est)
toc()
#> 0.46 sec elapsed
# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)
tic()
h_approx_1k = hatvalues(est, exact = FALSE, p = 1000)
toc()
#> 22.377 sec elapsed
Created on 2023-05-08 with reprex v2.0.2