# BioSciEconomist/ex LATE and IV.R

## ex LATE and IV.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex LATE and IV.R
# | DATE: 4/18/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: /Google Drive/R Training
# *----------------------------------------------------------------
# | PURPOSE: demonstrate how an IV captures the treatment effect of the compliers, or LATE
# *----------------------------------------------------------------

# see companion blog post: https://econometricsense.blogspot.com/2019/04/intent-to-treat-instrumental-variables.html

# this is largely based on: http://egap.org/methods-guides/10-things-you-need-know-about-local-average-treatment-effect

# additional references:

# 1) Angrist, Joshua D., et al. “Identification of Causal Effects Using Instrumental Variables.”
#    Journal of the American Statistical Association, vol. 91, no. 434, 1996, pp. 444–455. JSTOR, www.jstor.org/stable/2291629.
# 2) Angrist, J.D. J Exp Criminol (2006) 2: 23. https://doi.org/10.1007/s11292-005-5126-x

# see also: https://theincidentaleconomist.com/wordpress/instrumental-variable-corrected-randomized-trial/


library(dplyr) # data wrangling and aggregations
library(AER)  # for IV estimation

#-----------------------------------
# simulate data for LATE example
#-----------------------------------

# Build the toy data set with vectorized calls instead of growing lists
# element by element. Row layout (200 rows in total):
#   rows   1- 20: assigned to treatment, never takers, y = 5
#   rows  21-100: assigned to treatment, compliers,    y = 25
#   rows 101-120: controls,              never takers, y = 5
#   rows 121-200: controls,              compliers,    y = 20

# random treatment assignment indicator - and our 'instrument':
# 100 treatment cases (Z = 1) followed by 100 controls (Z = 0)
Z <- rep(c(1, 0), each = 100)

# label for our compliers and never takers: within each arm the first 20
# cases are 'never takers' and the remaining 80 are 'compliers'
status <- rep(rep(c('never taker', 'complier'), times = c(20, 80)), times = 2)

# treatment received indicator - the combination of treatment received (D)
# and treatment assigned (Z) determines our 'compliers' and 'never takers':
#   never takers have D = 0 regardless of treatment assignment (Z)
#   compliers have D = 1 when Z = 1 and D = 0 when Z = 0, i.e. D = Z
D <- ifelse(status == 'complier', Z, 0)

# outcome values: never takers average 5 in both arms, compliers average 25
# when assigned to treatment and 20 when assigned to control
y <- rep(c(5, 25, 5, 20), times = c(20, 80, 20, 80))

# pack up the vectors generated above into a data frame
df <- data.frame(y, Z, D, status)

#----------------------------------------
#  summarize our toy data
#----------------------------------------

# cross-tabulate compliance status against treatment assignment:
# in total we have 40 never takers and 160 compliers
table(status, Z)

# average outcome for each combination of compliance status,
# treatment received (D) and treatment assigned (Z)
df %>%
  group_by(status, D, Z) %>%
  summarize(Yavg = mean(y))

# average outcome by treatment received (D) only
df %>%
  group_by(D) %>%
  summarize(Yavg = mean(y))

# from this data it is clear that the difference in outcomes for our compliers is 25 - 20 = 5; that is the LATE
# we will identify using instrumental variables estimation below and compare to our ITT and as-treated analyses

#--------------------------------------
# analysis
#--------------------------------------

# intent-to-treat (ITT): regress the outcome on treatment assignment only
summary(lm(y ~ Z, data = df)) # ITT estimate   EST_B = 4

# compare those in the treatment group that received treatment (compliers)
# to all controls
with(df, mean(y[Z == 1 & D == 1]) - mean(y[Z == 0])) # EST_B = 8

# 'as treated': compare those who received treatment to those who did not
summary(lm(y ~ D, data = df)) # EST_B = 10

# estimate LATE via 2SLS done by hand

# 1st stage regression: predicted treatment received given assignment
D_star <- predict(lm(D ~ Z, data = df))

# 2nd stage regression: outcome on predicted treatment, EST_BIV = 5
# (the point estimate matches 2SLS, but standard errors from a manual second
# stage are not the correct 2SLS standard errors - use ivreg for inference)
lm(y ~ D_star, data = df)

# estimate the local average treatment effect, or b_iv, using Z (random
# treatment assignment) as an instrumental variable for D with ivreg
summary(ivreg(y ~ D | Z, data = df))

# notice this is the same as the difference in means between compliers in the
# treatment and control groups (which is the LATE)
with(
  df,
  mean(y[status == 'complier' & Z == 1]) - mean(y[status == 'complier' & Z == 0])
)
	# *-----------------------------------------------------------------
	# | PROGRAM NAME: ex LATE and IV.R
	# | DATE: 4/18/19
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE: /Google Drive/R Training
	# *----------------------------------------------------------------
	# | PURPOSE: demonstrate how an IV captures the treatment effect of the compliers, or LATE
	# *----------------------------------------------------------------

	# see companion blog post: https://econometricsense.blogspot.com/2019/04/intent-to-treat-instrumental-variables.html

	# this is largely based on: http://egap.org/methods-guides/10-things-you-need-know-about-local-average-treatment-effect

	# additional references:

	# 1) Angrist, Joshua D., et al. “Identification of Causal Effects Using Instrumental Variables.”
	# Journal of the American Statistical Association, vol. 91, no. 434, 1996, pp. 444–455. JSTOR, www.jstor.org/stable/2291629.
	# 2) Angrist, J.D. J Exp Criminol (2006) 2: 23. https://doi.org/10.1007/s11292-005-5126-x

	# see also: https://theincidentaleconomist.com/wordpress/instrumental-variable-corrected-randomized-trial/


	library(dplyr) # data wrangling and aggregations
	library(AER) # for IV estimation

	#-----------------------------------
	# simulate data for LATE example
	#-----------------------------------

	# Build the toy data set with vectorized calls instead of growing lists
	# element by element. Row layout (200 rows in total):
	#   rows   1- 20: assigned to treatment, never takers, y = 5
	#   rows  21-100: assigned to treatment, compliers,    y = 25
	#   rows 101-120: controls,              never takers, y = 5
	#   rows 121-200: controls,              compliers,    y = 20

	# random treatment assignment indicator - and our 'instrument':
	# 100 treatment cases (Z = 1) followed by 100 controls (Z = 0)
	Z <- rep(c(1, 0), each = 100)

	# label for our compliers and never takers: within each arm the first 20
	# cases are 'never takers' and the remaining 80 are 'compliers'
	status <- rep(rep(c('never taker', 'complier'), times = c(20, 80)), times = 2)

	# treatment received indicator - the combination of treatment received (D)
	# and treatment assigned (Z) determines our 'compliers' and 'never takers':
	#   never takers have D = 0 regardless of treatment assignment (Z)
	#   compliers have D = 1 when Z = 1 and D = 0 when Z = 0, i.e. D = Z
	D <- ifelse(status == 'complier', Z, 0)

	# outcome values: never takers average 5 in both arms, compliers average 25
	# when assigned to treatment and 20 when assigned to control
	y <- rep(c(5, 25, 5, 20), times = c(20, 80, 20, 80))

	# pack up the vectors generated above into a data frame
	df <- data.frame(y, Z, D, status)

	#----------------------------------------
	# summarize our toy data
	#----------------------------------------

	# cross-tabulate compliance status against treatment assignment:
	# in total we have 40 never takers and 160 compliers
	table(status, Z)

	# average outcome for each combination of compliance status,
	# treatment received (D) and treatment assigned (Z)
	df %>%
	  group_by(status, D, Z) %>%
	  summarize(Yavg = mean(y))

	# average outcome by treatment received (D) only
	df %>%
	  group_by(D) %>%
	  summarize(Yavg = mean(y))

	# from this data it is clear that the difference in outcomes for our compliers is 25 - 20 = 5; that is the LATE
	# we will identify using instrumental variables estimation below and compare to our ITT and as-treated analyses

	#--------------------------------------
	# analysis
	#--------------------------------------

	# intent-to-treat (ITT): regress the outcome on treatment assignment only
	summary(lm(y ~ Z, data = df)) # ITT estimate EST_B = 4

	# compare those in the treatment group that received treatment (compliers)
	# to all controls
	mean(df$y[df$Z == 1 & df$D == 1]) - mean(df$y[df$Z == 0]) # EST_B = 8

	# 'as treated': compare those who received treatment to those who did not
	summary(lm(y ~ D, data = df)) # EST_B = 10

	# estimate LATE via 2SLS done by hand

	# 1st stage regression: predicted treatment received given assignment
	D_star <- predict(lm(D ~ Z, data = df))

	# 2nd stage regression: outcome on predicted treatment, EST_BIV = 5
	# (the point estimate matches 2SLS, but standard errors from a manual second
	# stage are not the correct 2SLS standard errors - use ivreg for inference)
	lm(y ~ D_star, data = df)

	# estimate the local average treatment effect, or b_iv, using Z (random
	# treatment assignment) as an instrumental variable for D with ivreg;
	# in the formula, the regressors go before `|` and the instruments after it
	summary(ivreg(y ~ D | Z, data = df))

	# notice this is the same as the difference in means between compliers in the
	# treatment and control groups (which is the LATE)
	mean(df$y[df$status == 'complier' & df$Z == 1]) -
	  mean(df$y[df$status == 'complier' & df$Z == 0])