khakieconomics/run_heckman_correction.R

## run_heckman_correction.R
library(rstan); library(tidyverse)

# A utility function
extract_pars <- function(mle_fit, pars) {
  unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))]))
}

# Simulate fake data ------------------------------------------------------


N <- 1000
P <- 10
P2 <- 5
X = matrix(rnorm(N*P, 0, .5), N, P)
Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)


gamma <- rnorm(P + P2)
Z <- cbind(X, Z_raw)
alpha_1 <- rnorm(1)
alpha <- rnorm(1)
beta <- rnorm(P)
sigma_u <- 2

# Draw the structural shocks
structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))

# Simulate reporting a salary (participation) and the wage
participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
wage <- participation * (alpha + X %*% beta + structural_shocks[,2])

# Record the actual probabilities
actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)

# Create the data list
data_list <- list(N = N,
                  P = P,
                  P2 = P2,
                  X = X, Z_raw = Z_raw,
                  Y = as.numeric(wage),
                  participation = participation,
                  estimate_model = 1)


# Comparing our Bayesian fit to Heckman correction ---------------------------------------

# Vanilla OLS
ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1)
plot(coef(ols_fit), c(alpha, beta))
abline(0, 1)

# Heckman correction
first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit"))
yhat <- predict(first_stage, type = "response")
lambda <- dnorm(yhat)/pnorm(yhat)
second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1)
plot(coef(second_stage)[1:(P+1)], c(alpha, beta))
abline(0, 1)

# Compiled and fit the model ----------------------------------------------

compiled_model <- stan_model("heckman_correction.stan")


fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
fit_sim


# Compare to posterior means

plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta))
abline(0, 1)


# Now fit by penalized likelihood

fit_mle <- optimizing(compiled_model, data = data_list, init = 0)
plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta))
abline(0, 1)

plot(extract_pars(fit_mle, c("p\\[")),actual_p)
abline(0, 1, col = 2)


# Some rough plots of estimates vs generative actuals ---------------------

confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)

confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)

data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"),
           values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>%
  unnest() %>%
  mutate(estimates = confints$estimate,
         lower = confints$conf.low,
         upper = confints$conf.high) %>%
  ggplot(aes(x = values, y = estimates, colour = parameter)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  geom_abline(aes(intercept= 0, slope = 1))


data_frame(parameter = "p",
           values = list(actual_p)) %>%
  unnest() %>%
  mutate(estimates = confints_p$estimate,
         lower = confints_p$conf.low,
         upper = confints_p$conf.high,
         misestimated = values < lower | values > upper) %>%
  ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  guides(alpha = F) +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  geom_abline(aes(intercept= 0, slope = 1))


# What proportion of individual-level estimates fall within bounds?
# NB we probably need more iterations
data_frame(parameter = "p",
           values = list(actual_p)) %>%
  unnest() %>%
  mutate(estimates = confints_p$estimate,
         lower = confints_p$conf.low,
         upper = confints_p$conf.high,
         misestimated = values < lower | values > upper) %>%
  summarise(mean(misestimated))
	library(rstan); library(tidyverse)

	# A utility function
	extract_pars <- function(mle_fit, pars) {
	unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))]))
	}

	# Simulate fake data ------------------------------------------------------


	N <- 1000
	P <- 10
	P2 <- 5
	X = matrix(rnorm(N*P, 0, .5), N, P)
	Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)


	gamma <- rnorm(P + P2)
	Z <- cbind(X, Z_raw)
	alpha_1 <- rnorm(1)
	alpha <- rnorm(1)
	beta <- rnorm(P)
	sigma_u <- 2

	# Draw the structural shocks
	structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))

	# Simulate reporting a salary (participation) and the wage
	participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
	wage <- participation * (alpha + X %*% beta + structural_shocks[,2])

	# Record the actual probabilities
	actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)

	# Create the data list
	data_list <- list(N = N,
	P = P,
	P2 = P2,
	X = X, Z_raw = Z_raw,
	Y = as.numeric(wage),
	participation = participation,
	estimate_model = 1)


	# Comparing our Bayesian fit to Heckman correction ---------------------------------------

	# Vanilla OLS
	ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1)
	plot(coef(ols_fit), c(alpha, beta))
	abline(0, 1)

	# Heckman correction
	first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit"))
	yhat <- predict(first_stage, type = "response")
	lambda <- dnorm(yhat)/pnorm(yhat)
	second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1)
	plot(coef(second_stage)[1:(P+1)], c(alpha, beta))
	abline(0, 1)

	# Compiled and fit the model ----------------------------------------------

	compiled_model <- stan_model("heckman_correction.stan")


	fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
	fit_sim


	# Compare to posterior means

	plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta))
	abline(0, 1)


	# Now fit by penalized likelihood

	fit_mle <- optimizing(compiled_model, data = data_list, init = 0)
	plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta))
	abline(0, 1)

	plot(extract_pars(fit_mle, c("p\\[")),actual_p)
	abline(0, 1, col = 2)


	# Some rough plots of estimates vs generative actuals ---------------------

	confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)

	confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)

	data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"),
	values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>%
	unnest() %>%
	mutate(estimates = confints$estimate,
	lower = confints$conf.low,
	upper = confints$conf.high) %>%
	ggplot(aes(x = values, y = estimates, colour = parameter)) +
	geom_point() +
	geom_linerange(aes(ymin = lower, ymax = upper)) +
	geom_abline(aes(intercept= 0, slope = 1))


	data_frame(parameter = "p",
	values = list(actual_p)) %>%
	unnest() %>%
	mutate(estimates = confints_p$estimate,
	lower = confints_p$conf.low,
	upper = confints_p$conf.high,
	misestimated = values < lower \| values > upper) %>%
	ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
	geom_point() +
	scale_color_manual(values = c("black", "red")) +
	guides(alpha = F) +
	geom_linerange(aes(ymin = lower, ymax = upper)) +
	geom_abline(aes(intercept= 0, slope = 1))


	# What proportion of individual-level estimates fall within bounds?
	# NB we probably need more iterations
	data_frame(parameter = "p",
	values = list(actual_p)) %>%
	unnest() %>%
	mutate(estimates = confints_p$estimate,
	lower = confints_p$conf.low,
	upper = confints_p$conf.high,
	misestimated = values < lower \| values > upper) %>%
	summarise(mean(misestimated))