khakieconomics/run_biprobit.R

## run_biprobit.R
library(rstan); library(dplyr); library(ggplot2); library(reshape2)
options(mc.cores = parallel::detectCores())

set.seed(49)
# Observations
N <- 2000

# Draw errors
rho <- 0.5
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
errors <- MASS::mvrnorm(N, c(0, 0), Sigma)

# Draw some fake covariates
P <- 5
P2 <- 3
X <- cbind(1, matrix(rnorm((P-1)*N), N, P-1))
Z <- matrix(rnorm(P2*N), N, P2)

# Some fake parameters
beta <- rnorm(P + P2)
gamma <- rnorm(P + 1)

# Make Y
Ystar1 <- cbind(X, Z) %*% beta + errors[,1]
Y1 = as.numeric(Ystar1 > 0 )

# Make covariates for second stage
X2 <- cbind(Y1, X)

# Make outcome
Ystar2 <- X2 %*% gamma + errors[,2]
Y2 <- as.numeric(Ystar2 > 0)

# Create data
Y <- cbind(Y1, Y2)
cor(Y)
# Does regular old probit work?
estimates <- coef(glm(Y[,2] ~ .  - 1, data = as.data.frame(X2), family = binomial(link = "probit")))

# let's look at the estimates

data.frame(gamma, estimates) %>%
  mutate(endo_regressor = 1:n()==1) %>%
  ggplot(aes(gamma, estimates, colour = endo_regressor)) +
  geom_point() +
  labs(title = "There's pretty clearly some bias")

# Pretty clearly there's a bias!


# Stan time ---------------------------------------------------------------


# Let's run Stan
data_list <- list(N = N, P = P, P2 = P2, X = X, Z = Z, Y = as.data.frame(Y))
compiled_model <- stan_model("biprobit.stan")
estimated_model <- sampling(compiled_model, data = data_list, iter = 500)

# Look at the estimates relative to the actuals (gamma[1] was biased)
estimated_model
gamma

# Let's look at the estimates

as.data.frame(estimated_model, pars = c("gamma","beta")) %>%
  melt() %>%
  group_by(variable) %>%
  summarise(median = median(value),
            lower = quantile(value, .025),
            upper = quantile(value, .975)) %>%
  mutate(true_values = c(gamma, beta),
         size = ifelse(1:n()==1, 3, 1)) %>%
  ggplot(aes(x = median)) +
  geom_linerange(aes(ymin = lower, ymax = upper), colour = "red") +
  geom_point(aes(y = median), colour = "red") +
  geom_point(aes(y = true_values, colour = size)) +
  labs(title = "Muuuuch better")
	library(rstan); library(dplyr); library(ggplot2); library(reshape2)
	options(mc.cores = parallel::detectCores())

	set.seed(49)
	# Observations
	N <- 2000

	# Draw errors
	rho <- 0.5
	Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
	errors <- MASS::mvrnorm(N, c(0, 0), Sigma)

	# Draw some fake covariates
	P <- 5
	P2 <- 3
	X <- cbind(1, matrix(rnorm((P-1)*N), N, P-1))
	Z <- matrix(rnorm(P2*N), N, P2)

	# Some fake parameters
	beta <- rnorm(P + P2)
	gamma <- rnorm(P + 1)

	# Make Y
	Ystar1 <- cbind(X, Z) %*% beta + errors[,1]
	Y1 = as.numeric(Ystar1 > 0 )

	# Make covariates for second stage
	X2 <- cbind(Y1, X)

	# Make outcome
	Ystar2 <- X2 %*% gamma + errors[,2]
	Y2 <- as.numeric(Ystar2 > 0)

	# Create data
	Y <- cbind(Y1, Y2)
	cor(Y)
	# Does regular old probit work?
	estimates <- coef(glm(Y[,2] ~ . - 1, data = as.data.frame(X2), family = binomial(link = "probit")))

	# let's look at the estimates

	data.frame(gamma, estimates) %>%
	mutate(endo_regressor = 1:n()==1) %>%
	ggplot(aes(gamma, estimates, colour = endo_regressor)) +
	geom_point() +
	labs(title = "There's pretty clearly some bias")

	# Pretty clearly there's a bias!



	# Stan time ---------------------------------------------------------------


	# Let's run Stan
	data_list <- list(N = N, P = P, P2 = P2, X = X, Z = Z, Y = as.data.frame(Y))
	compiled_model <- stan_model("biprobit.stan")
	estimated_model <- sampling(compiled_model, data = data_list, iter = 500)

	# Look at the estimates relative to the actuals (gamma[1] was biased)
	estimated_model
	gamma

	# Let's look at the estimates

	as.data.frame(estimated_model, pars = c("gamma","beta")) %>%
	melt() %>%
	group_by(variable) %>%
	summarise(median = median(value),
	lower = quantile(value, .025),
	upper = quantile(value, .975)) %>%
	mutate(true_values = c(gamma, beta),
	size = ifelse(1:n()==1, 3, 1)) %>%
	ggplot(aes(x = median)) +
	geom_linerange(aes(ymin = lower, ymax = upper), colour = "red") +
	geom_point(aes(y = median), colour = "red") +
	geom_point(aes(y = true_values, colour = size)) +
	labs(title = "Muuuuch better")