Skip to content

Instantly share code, notes, and snippets.

@khakieconomics
Last active August 9, 2018 17:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save khakieconomics/423825e2b83c448d95b5f7198b532fdd to your computer and use it in GitHub Desktop.
Save khakieconomics/423825e2b83c448d95b5f7198b532fdd to your computer and use it in GitHub Desktop.
library(rstan); library(tidyverse)
# A utility function
extract_pars <- function(mle_fit, pars) {
unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))]))
}
# Simulate fake data ------------------------------------------------------
N <- 1000
P <- 10
P2 <- 5
X = matrix(rnorm(N*P, 0, .5), N, P)
Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)
gamma <- rnorm(P + P2)
Z <- cbind(X, Z_raw)
alpha_1 <- rnorm(1)
alpha <- rnorm(1)
beta <- rnorm(P)
sigma_u <- 2
# Draw the structural shocks
structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))
# Simulate reporting a salary (participation) and the wage
participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
wage <- participation * (alpha + X %*% beta + structural_shocks[,2])
# Record the actual probabilities
actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)
# Create the data list
data_list <- list(N = N,
P = P,
P2 = P2,
X = X, Z_raw = Z_raw,
Y = as.numeric(wage),
participation = participation,
estimate_model = 1)
# Comparing our Bayesian fit to Heckman correction ---------------------------------------
# Vanilla OLS
ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1)
plot(coef(ols_fit), c(alpha, beta))
abline(0, 1)
# Heckman correction
first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit"))
yhat <- predict(first_stage, type = "response")
lambda <- dnorm(yhat)/pnorm(yhat)
second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1)
plot(coef(second_stage)[1:(P+1)], c(alpha, beta))
abline(0, 1)
# Compiled and fit the model ----------------------------------------------
compiled_model <- stan_model("heckman_correction.stan")
fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
fit_sim
# Compare to posterior means
plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta))
abline(0, 1)
# Now fit by penalized likelihood
fit_mle <- optimizing(compiled_model, data = data_list, init = 0)
plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta))
abline(0, 1)
plot(extract_pars(fit_mle, c("p\\[")),actual_p)
abline(0, 1, col = 2)
# Some rough plots of estimates vs generative actuals ---------------------
confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)
confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)
data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"),
values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>%
unnest() %>%
mutate(estimates = confints$estimate,
lower = confints$conf.low,
upper = confints$conf.high) %>%
ggplot(aes(x = values, y = estimates, colour = parameter)) +
geom_point() +
geom_linerange(aes(ymin = lower, ymax = upper)) +
geom_abline(aes(intercept= 0, slope = 1))
data_frame(parameter = "p",
values = list(actual_p)) %>%
unnest() %>%
mutate(estimates = confints_p$estimate,
lower = confints_p$conf.low,
upper = confints_p$conf.high,
misestimated = values < lower | values > upper) %>%
ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
geom_point() +
scale_color_manual(values = c("black", "red")) +
guides(alpha = F) +
geom_linerange(aes(ymin = lower, ymax = upper)) +
geom_abline(aes(intercept= 0, slope = 1))
# What proportion of individual-level estimates fall within bounds?
# NB we probably need more iterations
data_frame(parameter = "p",
values = list(actual_p)) %>%
unnest() %>%
mutate(estimates = confints_p$estimate,
lower = confints_p$conf.low,
upper = confints_p$conf.high,
misestimated = values < lower | values > upper) %>%
summarise(mean(misestimated))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment