`hatvalues.fixest` implementation

Fast implementation of `hatvalues.fixest`. This calculates (or approximates) the diagonal elements $P_{ii}$ of the projection matrix $X (X'X)^{-1} X'$.

For comparison, with p = 500 draws and 1M fixed effects, Kline, Saggio, and Sølvsten report ~250 seconds for the approximation (with parallelization) in their computational appendix. This implementation takes ~100 seconds without parallelization, and just 2.33 seconds when the model has only fixed effects and no covariates.
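As a quick sanity check on this definition, here is a toy base-R sketch (simulated data, independent of the implementation below) confirming that the diagonal of $X (X'X)^{-1} X'$ matches `stats::hatvalues()` on an `lm` fit:

set.seed(42)
d <- data.frame(x1 = rnorm(20), x2 = rnorm(20))
d$y <- d$x1 + rnorm(20)
fit <- lm(y ~ x1 + x2, data = d)
X <- model.matrix(fit)

# Diagonal of the hat matrix, computed directly
P_ii <- diag(X %*% solve(crossprod(X), t(X)))
all.equal(unname(P_ii), unname(hatvalues(fit)))
#> [1] TRUE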

library(data.table)
library(fixest)
library(Matrix)
library(MatrixExtra)
#> 'MatrixExtra' modifies important behaviors from 'Matrix'. See ?MatrixExtra-options.
library(tictoc)
#> 
#> Attaching package: 'tictoc'
#> The following object is masked from 'package:data.table':
#> 
#>     shift

# `hatvalues` for feols models -------------------------------------------------
hatvalues.fixest <- function(model, exact = TRUE, p = 1000) {
  
  # NOTE: `sparse_model_matrix()` is internal to did2s; as written, this relies
  # on `df` (the estimation data) existing in the calling environment
  X = did2s:::sparse_model_matrix(df, model)

  # If fixed effects only, flag
  fe_only = is.null(X$mat_RHS)
  X = cbind(X$mat_RHS, X$mat_FE)
  S_xx = crossprod(X)
  
  # If there are only fixed effects, the diagonal can be computed *super fast*
  if (fe_only == TRUE) {
    # https://stackoverflow.com/questions/39533785/how-to-compute-diagx-solvea-tx-efficiently-without-taking-matrix-i
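    # With the Cholesky factor S_xx = U'U, P_ii = x_i' (U'U)^{-1} x_i = ||U^{-T} x_i||^2,
    # so the diagonal is the squared column norms of Z = U^{-T} X', and the
    # n x n projection matrix is never formed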
    U = Matrix::chol(S_xx)
    Z = solve(t(U), t(X))
    P_ii = Matrix::colSums(Z ^ 2)
    
    return(P_ii)
    # P_ii[1:10]
    # [1] 0.2000000 0.2000000 0.2000000 0.2000000 0.2000000 0.3333333 0.3333333 0.3333333 0.1428571 0.1428571
  } 
    
  # Note: `diag(X %*% Z)` materializes the full n x n product, so this can
  # run out of memory for very large X
  if (exact == TRUE) {
    Z = solve(S_xx, t(X))
    P_ii = diag(X %*% Z)
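    # (a lighter alternative, assuming elementwise products of sparse and dense
    #  matrices behave as usual here, would be Matrix::rowSums(X * t(Z)),
    #  which avoids forming the dense n x n product)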

    return(P_ii)
    # P_ii[1:10]
    # [1] 0.3344063 0.3336252 0.3335790 0.3339978 0.3337960 0.3333515 0.3333510 0.3335557 0.3336988 0.2001362
    # mean(P_ii)
    # [1] 0.2731241
  }

  if (exact == FALSE) {
    
    n <- nrow(X)
    k <- ncol(X)

    # TODO: think about using fixest object to calculate \hat{\beta} = (X'X)^{-1} X' R_pi for i = 1, ..., p

    # Choose among three strategies based on n*p, trading speed against memory
    # Solve for Z = (X'X)^{-1} (X' R_p)
    if (n*p <= 1000000) {
      # Fastest for small n*p (avoids the explicit for loops used below)

      # Rademacher weights: rbinom draws {0,1}, so *2 - 1 maps them to {-1, +1}
      Rp <- matrix(rbinom(n * p, 1, 0.5) * 2 - 1, nrow = p, ncol = n)
      X_Rp <- tcrossprod(t(X), Rp)
      Z = solve(S_xx, X_Rp)
    
      # P_ii = 1/p || X_i' Z ||^2
      P_ii = colSums(crossprod(Z, t(X))^2) / p 

    } else if (n*p <= 10000000) {
      # More memory-efficient, but slower

      X_Rp <- matrix(0, nrow = k, ncol = p)
      for (j in 1:p) {
        Rp_j <- matrix(rbinom(n, 1, 0.5) * 2 - 1, nrow = 1, ncol = n)
        X_Rp[, j] <- tcrossprod(t(X), Rp_j)
      }
      Z = solve(S_xx, X_Rp)

      # P_ii = 1/p || X_i' Z ||^2
      # Needed for speed. Dense already, so no memory difference
      Z_dense = as.matrix(Z)

      P_ii = rep(0, nrow(X))
      for (j in 1:ncol(Z_dense)) {
        Zj = Z_dense[, j, drop = FALSE]
        P_ii = P_ii + as.numeric(X %*% Zj)^2 / p
      }

    } else {
      # Slightly slower, but very memory efficient
      
      P_ii = rep(0, nrow(X))
      for (j in 1:p) {
        Rp_j <- matrix(rbinom(n, 1, 0.5) * 2 - 1, nrow = 1, ncol = n)
        X_Rp_j <- tcrossprod(t(X), Rp_j)
        Zj <- solve(S_xx, X_Rp_j)

        P_ii = P_ii + as.numeric(X %*% Zj)^2 / p
      }
    }

    return(P_ii)

    # Checking, when p = 1000000, n = 3665 (1000 FEs)
    # P_ii[1:10]
    # [1] 0.3338723 0.3330020 0.3329582 0.3340253 0.3338307 0.3333825 0.3327888 0.3329766 0.3331747 0.2001476
    # mean(P_ii)
    # [1] 0.2731081
  }
}

# Test -------------------------------------------------------------------------

N = 1000000
var_fes = 10
var_eps = 4

# Ensure P_ii exists
N_i = pmax(2, rpois(N, 3))

# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]

# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)

tic()
h = hatvalues(est)
toc()
#> 2.333 sec elapsed


# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)

tic()
h_approx_1k = hatvalues(est, exact = FALSE, p = 500)
toc()
#> 100.744 sec elapsed

Created on 2023-05-08 with reprex v2.0.2

This uses the JLA approximation as described in Kline, Saggio, and Sølvsten. The `hatvalues.fixest` function has two parameters: `exact`, which determines whether to solve for $P_{ii}$ exactly or via the approximation, and `p`, which sets the number of Rademacher weight draws; larger values increase the accuracy of the estimate. Even for small values of `p`, the average $P_{ii}$ is very precise and individual deviations are small.

One thing to note is that the estimated $P_{ii}$ can be greater than 1. Kline, Saggio, and Sølvsten propose an improved estimator that fixes this, but I haven't implemented it yet.
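As a sketch of the mechanics, here is a minimal toy example of the JLA idea on a small dense matrix (plain base R; the simulated `X`, `n`, and `p` are illustrative only, not the fixest pipeline). Since $P = X(X'X)^{-1}X'$ is symmetric and idempotent, $E[(Pr)_i^2] = P_{ii}$ for a Rademacher vector $r$, so averaging the squared entries of $Pr$ over `p` draws recovers the diagonal:

set.seed(1)
n <- 50
X <- cbind(1, rnorm(n), rnorm(n))
S_xx <- crossprod(X)

# Exact diagonal, for reference
P_exact <- diag(X %*% solve(S_xx, t(X)))

# JLA: average (P r)_i^2 over p Rademacher draws
p <- 5000
acc <- numeric(n)
for (j in 1:p) {
  r <- sample(c(-1, 1), n, replace = TRUE)
  z <- solve(S_xx, crossprod(X, r))   # (X'X)^{-1} X' r
  acc <- acc + as.numeric(X %*% z)^2  # (P r)_i^2
}
P_jla <- acc / p
max(abs(P_jla - P_exact))  # small, and shrinks like 1/sqrt(p)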

# (library setup and `hatvalues.fixest` definition identical to the block above)

# Test -------------------------------------------------------------------------

N = 1000
var_fes = 10
var_eps = 4

# Ensure P_ii exists
N_i = pmax(2, rpois(N, 3))

# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]

# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)

tic()
h = hatvalues(est)
toc()
#> 0.048 sec elapsed


# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)

tic()
h_exact = hatvalues(est)
toc()
#> 0.532 sec elapsed
tic()
h_approx_1k = hatvalues(est, exact = FALSE, p = 1000)
toc()
#> 0.974 sec elapsed
tic()
h_approx_10k = hatvalues(est, exact = FALSE, p = 10000)
toc()
#> 9.101 sec elapsed

# How good is the approximation?
diff_1k = h_exact - h_approx_1k
mean(diff_1k)
#> [1] 0.0001043958
sd(diff_1k)
#> [1] 0.01128287

diff_10k = h_exact - h_approx_10k
mean(diff_10k)
#> [1] -0.0002276724
sd(diff_10k)
#> [1] 0.003602415
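
Note that the approximation error scales roughly like $1/\sqrt{p}$: increasing `p` tenfold shrinks the standard deviation by about $\sqrt{10} \approx 3.2$ (0.0113 to 0.0036 above).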

Created on 2023-05-08 with reprex v2.0.2

# (library setup and `hatvalues.fixest` definition identical to the block above)

# Test -------------------------------------------------------------------------

N = 100000
var_fes = 10
var_eps = 4

# Ensure P_ii exists
N_i = pmax(2, rpois(N, 3))

# Create dataset
df = data.table(id = rep(1:N, N_i))
df[, j := 1:.N, by = id]
df[, fe_i := rnorm(1, 0, sd = sqrt(var_fes)), by = id]
df[, x := rnorm(.N, mean = fe_i, sd = 1)]
df[, y := x * 1 + fe_i + rnorm(.N, 0, sqrt(var_eps))]

# FE only model ----------------------------------------------------------------
est = feols(y ~ 0 | id, data = df)

tic()
h = hatvalues(est)
toc()
#> 0.46 sec elapsed


# Covariates -------------------------------------------------------------------
est = feols(y ~ x | id, data = df)

tic()
h_approx_1k = hatvalues(est, exact = FALSE, p = 1000)
toc()
#> 22.377 sec elapsed

Created on 2023-05-08 with reprex v2.0.2
