# BioSciEconomist/ex toy synthetic controls.R

## ex toy synthetic controls.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex toy synthetic controls.R
# | DATE: 4/23/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: Macintosh HD ▸ Users ▸ amandabogard ▸ Google Drive ▸ R Training
# *----------------------------------------------------------------
# | PURPOSE: intuition for how synthetic control methods work
# *----------------------------------------------------------------

library(Synth) # load Synth package

# Print numbers in fixed notation with four significant digits instead of
# scientific notation.
options(scipen = 100, digits = 4)

# Location of the toy panel on the author's machine. When no file exists at
# this path the script falls back to an embedded copy of the same rows that
# are listed as comments at the bottom of this file, so the example runs on
# any machine.
toy.data.path <- "/Users/amandabogard/Google Drive/R Training/toysynth.txt"

if (file.exists(toy.data.path)) {
  df <- read.csv(toy.data.path) # read data from disk
} else {
  message("'", toy.data.path, "' not found; using the embedded toy data.")
  # One string per CSV line; read.csv() reads them through a text connection.
  df <- read.csv(text = c(
    "ID,year,state,Y,X1,X2,X3",
    "1,1990,KY,.45,50000,25,10",
    "1,1991,KY,.45,51000,26,10",
    "1,1992,KY,.46,52000,27,10",
    "1,1993,KY,.48,52000,28,10",
    "1,1994,KY,.48,52000,28,10",
    "1,1995,KY,.48,53000,27,15",
    "1,1996,KY,.49,53000,24,15",
    "1,1997,KY,.50,54000,24,15",
    "1,1998,KY,.51,55000,23,15",
    "2,1990,TN,.45,52000,23,12",
    "2,1991,TN,.45,51000,23,12",
    "2,1992,TN,.44,53000,24,12",
    "2,1993,TN,.45,51000,26,12",
    "2,1994,TN,.44,52000,25,12",
    "2,1995,TN,.43,54000,26,14",
    "2,1996,TN,.42,54000,25,14",
    "2,1997,TN,.40,55000,26,14",
    "2,1998,TN,.41,56000,25,14",
    "3,1990,CA,.89,102000,10,20",
    "3,1991,CA,.90,102500,11,20",
    "3,1992,CA,.90,103000,13,20",
    "3,1993,CA,.92,103500,12,20",
    "3,1994,CA,.93,104000,11,20",
    "3,1995,CA,.93,104000,12,25",
    "3,1996,CA,.94,104500,14,25",
    "3,1997,CA,.94,105000,12,25",
    "3,1998,CA,.95,105000,10,25",
    "4,1990,IN,.43,52000,25,10",
    "4,1991,IN,.44,52000,26,10",
    "4,1992,IN,.42,53000,26,10",
    "4,1993,IN,.46,53500,27,10",
    "4,1994,IN,.45,53500,28,10",
    "4,1995,IN,.46,54000,26,12",
    "4,1996,IN,.47,54000,26,12",
    "4,1997,IN,.45,54500,25,12",
    "4,1998,IN,.46,55000,24,12"
  ))
}
head(df)

# Synth requires the unit-name variable to be character, not factor.
df$state <- as.character(df$state)

# Build the inputs for synth() from the long-format panel in df.
# Unit IDs follow the data listing at the bottom of this file:
# 1 = KY, 2 = TN, 3 = CA, 4 = IN.
dataprep.out <- dataprep(
  foo = df,

  # unit and time keys
  unit.variable = "ID",             # numeric unit identifier (state level)
  unit.names.variable = "state",    # unit labels; must be a character column
  time.variable = "year",           # the panel is tracked by year

  # treated unit and donor pool
  treatment.identifier = 1,         # ID 1 (KY) is the treated unit
  controls.identifier = c(2, 3, 4), # donor pool weighted into the synthetic control

  # outcome and matching variables
  dependent = "Y",                  # outcome variable
  predictors = c("X1", "X2", "X3"), # predictors used to build the synthetic control
  predictors.op = "mean",           # predictors are aggregated by averaging
  # special.predictors lets each matching variable carry its own time window
  # and operator; here the 1990-1995 means of the outcome Y and of X1-X3 are
  # added as matching variables.
  special.predictors = lapply(
    c("Y", "X1", "X2", "X3"),
    function(v) list(v, 1990:1995, "mean")
  ),

  # time windows
  time.predictors.prior = 1990:1995, # pre-treatment years over which predictors are averaged
  time.optimize.ssr = 1990:1995,     # years of Y over which the loss function is minimized
  time.plot = 1990:1998              # years over which results are plotted
)

# Estimate the donor weights that define the synthetic control unit.
synth.out <- synth(data.prep.obj = dataprep.out)

# Plot the outcome trend of the treated unit against its synthetic control.
path.plot(
  dataprep.res = dataprep.out,
  synth.res = synth.out,
  Xlab = "year",
  Ylab = "made up outcome",
  Legend = c("KY", "Synthetic KY"),
  Legend.position = "bottomright"
)


# Gaps: treated outcome minus the weighted donor outcomes, per plotted year.
# %*% binds tighter than binary minus, so no parentheses are required.
gaps <- dataprep.out[["Y1plot"]] -
  dataprep.out[["Y0plot"]] %*% synth.out[["solution.w"]]

# Pre-built summary tables derived from the dataprep and synth objects.
synth.tables <- synth.tab(synth.res = synth.out, dataprep.res = dataprep.out)

# Balance check: pre-treatment predictor values for the treated unit, the
# synthetic control unit, and all the units in the sample (first five rows).
synth.tables[["tab.pred"]][seq_len(5), ]

# Weights assigned to each control unit.
synth.tables[["tab.w"]]

# toy data source read as csv

#ID,year,state,Y,X1,X2,X3
#1,1990,KY,.45,50000,25,10
#1,1991,KY,.45,51000,26,10
#1,1992,KY,.46,52000,27,10
#1,1993,KY,.48,52000,28,10
#1,1994,KY,.48,52000,28,10
#1,1995,KY,.48,53000,27,15
#1,1996,KY,.49,53000,24,15
#1,1997,KY,.50,54000,24,15
#1,1998,KY,.51,55000,23,15
#2,1990,TN,.45,52000,23,12
#2,1991,TN,.45,51000,23,12
#2,1992,TN,.44,53000,24,12
#2,1993,TN,.45,51000,26,12
#2,1994,TN,.44,52000,25,12
#2,1995,TN,.43,54000,26,14
#2,1996,TN,.42,54000,25,14
#2,1997,TN,.40,55000,26,14
#2,1998,TN,.41,56000,25,14
#3,1990,CA,.89,102000,10,20
#3,1991,CA,.90,102500,11,20
#3,1992,CA,.90,103000,13,20
#3,1993,CA,.92,103500,12,20
#3,1994,CA,.93,104000,11,20
#3,1995,CA,.93,104000,12,25
#3,1996,CA,.94,104500,14,25
#3,1997,CA,.94,105000,12,25
#3,1998,CA,.95,105000,10,25
#4,1990,IN,.43,52000,25,10
#4,1991,IN,.44,52000,26,10
#4,1992,IN,.42,53000,26,10
#4,1993,IN,.46,53500,27,10
#4,1994,IN,.45,53500,28,10
#4,1995,IN,.46,54000,26,12
#4,1996,IN,.47,54000,26,12
#4,1997,IN,.45,54500,25,12
#4,1998,IN,.46,55000,24,12
	# *-----------------------------------------------------------------
	# | PROGRAM NAME: ex toy synthetic controls.R
	# | DATE: 4/23/19
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE: Macintosh HD ▸ Users ▸ amandabogard ▸ Google Drive ▸ R Training
	# *----------------------------------------------------------------
	# | PURPOSE: intuition for how synthetic control methods work
	# *----------------------------------------------------------------

	library(Synth) # load Synth package

	# Print numbers in fixed notation with four significant digits instead of
	# scientific notation.
	options(scipen = 100, digits = 4)

	# Location of the toy panel on the author's machine. When no file exists at
	# this path the script falls back to an embedded copy of the same rows that
	# are listed as comments at the bottom of this file, so the example runs on
	# any machine.
	toy.data.path <- "/Users/amandabogard/Google Drive/R Training/toysynth.txt"

	if (file.exists(toy.data.path)) {
	  df <- read.csv(toy.data.path) # read data from disk
	} else {
	  message("'", toy.data.path, "' not found; using the embedded toy data.")
	  # One string per CSV line; read.csv() reads them through a text connection.
	  df <- read.csv(text = c(
	    "ID,year,state,Y,X1,X2,X3",
	    "1,1990,KY,.45,50000,25,10",
	    "1,1991,KY,.45,51000,26,10",
	    "1,1992,KY,.46,52000,27,10",
	    "1,1993,KY,.48,52000,28,10",
	    "1,1994,KY,.48,52000,28,10",
	    "1,1995,KY,.48,53000,27,15",
	    "1,1996,KY,.49,53000,24,15",
	    "1,1997,KY,.50,54000,24,15",
	    "1,1998,KY,.51,55000,23,15",
	    "2,1990,TN,.45,52000,23,12",
	    "2,1991,TN,.45,51000,23,12",
	    "2,1992,TN,.44,53000,24,12",
	    "2,1993,TN,.45,51000,26,12",
	    "2,1994,TN,.44,52000,25,12",
	    "2,1995,TN,.43,54000,26,14",
	    "2,1996,TN,.42,54000,25,14",
	    "2,1997,TN,.40,55000,26,14",
	    "2,1998,TN,.41,56000,25,14",
	    "3,1990,CA,.89,102000,10,20",
	    "3,1991,CA,.90,102500,11,20",
	    "3,1992,CA,.90,103000,13,20",
	    "3,1993,CA,.92,103500,12,20",
	    "3,1994,CA,.93,104000,11,20",
	    "3,1995,CA,.93,104000,12,25",
	    "3,1996,CA,.94,104500,14,25",
	    "3,1997,CA,.94,105000,12,25",
	    "3,1998,CA,.95,105000,10,25",
	    "4,1990,IN,.43,52000,25,10",
	    "4,1991,IN,.44,52000,26,10",
	    "4,1992,IN,.42,53000,26,10",
	    "4,1993,IN,.46,53500,27,10",
	    "4,1994,IN,.45,53500,28,10",
	    "4,1995,IN,.46,54000,26,12",
	    "4,1996,IN,.47,54000,26,12",
	    "4,1997,IN,.45,54500,25,12",
	    "4,1998,IN,.46,55000,24,12"
	  ))
	}
	head(df)

	# Synth requires the unit-name variable to be character, not factor.
	df$state <- as.character(df$state)

	# Build the inputs for synth() from the long-format panel in df.
	# Unit IDs follow the data listing at the bottom of this file:
	# 1 = KY, 2 = TN, 3 = CA, 4 = IN.
	dataprep.out <- dataprep(
	  foo = df,

	  # unit and time keys
	  unit.variable = "ID",             # numeric unit identifier (state level)
	  unit.names.variable = "state",    # unit labels; must be a character column
	  time.variable = "year",           # the panel is tracked by year

	  # treated unit and donor pool
	  treatment.identifier = 1,         # ID 1 (KY) is the treated unit
	  controls.identifier = c(2, 3, 4), # donor pool weighted into the synthetic control

	  # outcome and matching variables
	  dependent = "Y",                  # outcome variable
	  predictors = c("X1", "X2", "X3"), # predictors used to build the synthetic control
	  predictors.op = "mean",           # predictors are aggregated by averaging
	  # special.predictors lets each matching variable carry its own time window
	  # and operator; here the 1990-1995 means of the outcome Y and of X1-X3 are
	  # added as matching variables.
	  special.predictors = lapply(
	    c("Y", "X1", "X2", "X3"),
	    function(v) list(v, 1990:1995, "mean")
	  ),

	  # time windows
	  time.predictors.prior = 1990:1995, # pre-treatment years over which predictors are averaged
	  time.optimize.ssr = 1990:1995,     # years of Y over which the loss function is minimized
	  time.plot = 1990:1998              # years over which results are plotted
	)

	# Estimate the donor weights that define the synthetic control unit.
	synth.out <- synth(data.prep.obj = dataprep.out)

	# Plot the outcome trend of the treated unit against its synthetic control.
	path.plot(
	  dataprep.res = dataprep.out,
	  synth.res = synth.out,
	  Xlab = "year",
	  Ylab = "made up outcome",
	  Legend = c("KY", "Synthetic KY"),
	  Legend.position = "bottomright"
	)


	# Gaps: treated outcome minus the weighted donor outcomes, per plotted year.
	# %*% binds tighter than binary minus, so no parentheses are required.
	gaps <- dataprep.out[["Y1plot"]] -
	  dataprep.out[["Y0plot"]] %*% synth.out[["solution.w"]]

	# Pre-built summary tables derived from the dataprep and synth objects.
	synth.tables <- synth.tab(synth.res = synth.out, dataprep.res = dataprep.out)

	# Balance check: pre-treatment predictor values for the treated unit, the
	# synthetic control unit, and all the units in the sample (first five rows).
	synth.tables[["tab.pred"]][seq_len(5), ]

	# Weights assigned to each control unit.
	synth.tables[["tab.w"]]

	# toy data source read as csv

	#ID,year,state,Y,X1,X2,X3
	#1,1990,KY,.45,50000,25,10
	#1,1991,KY,.45,51000,26,10
	#1,1992,KY,.46,52000,27,10
	#1,1993,KY,.48,52000,28,10
	#1,1994,KY,.48,52000,28,10
	#1,1995,KY,.48,53000,27,15
	#1,1996,KY,.49,53000,24,15
	#1,1997,KY,.50,54000,24,15
	#1,1998,KY,.51,55000,23,15
	#2,1990,TN,.45,52000,23,12
	#2,1991,TN,.45,51000,23,12
	#2,1992,TN,.44,53000,24,12
	#2,1993,TN,.45,51000,26,12
	#2,1994,TN,.44,52000,25,12
	#2,1995,TN,.43,54000,26,14
	#2,1996,TN,.42,54000,25,14
	#2,1997,TN,.40,55000,26,14
	#2,1998,TN,.41,56000,25,14
	#3,1990,CA,.89,102000,10,20
	#3,1991,CA,.90,102500,11,20
	#3,1992,CA,.90,103000,13,20
	#3,1993,CA,.92,103500,12,20
	#3,1994,CA,.93,104000,11,20
	#3,1995,CA,.93,104000,12,25
	#3,1996,CA,.94,104500,14,25
	#3,1997,CA,.94,105000,12,25
	#3,1998,CA,.95,105000,10,25
	#4,1990,IN,.43,52000,25,10
	#4,1991,IN,.44,52000,26,10
	#4,1992,IN,.42,53000,26,10
	#4,1993,IN,.46,53500,27,10
	#4,1994,IN,.45,53500,28,10
	#4,1995,IN,.46,54000,26,12
	#4,1996,IN,.47,54000,26,12
	#4,1997,IN,.45,54500,25,12
	#4,1998,IN,.46,55000,24,12