# diamonaj/causalinf

## causalinf
# PEACEKEEPING WORKOUT (based on King, Gary; Zeng, Langche, 2007,
#                       "Replication data for: When Can History be Our Guide?
#                       The Pitfalls of Counterfactual Inference",
#                       https://hdl.handle.net/1902.1/DXRXCFAWPK,
#                       Harvard Dataverse, V4,
#                       UNF:3:DaYlT6QSX9r0D50ye+tXpA== [fileUNF] )
# CONSIDER USING THE JUPYTER NOTEBOOK WITH R-SERVER KERNEL (NEVER R-SAGE KERNEL)
# Download the peacekeeping data set straight from the course server.
foo <- read.csv(
  file = "https://course-resources.minerva.kgi.edu/uploaded_files/mke/00086677-3767/peace.csv"
)

# Keep only the columns used in this exercise, selected by position
# (the order given here is the column order of the resulting data frame).
foo <- foo[, c(
  6:8, 11:16,
  99, 50, 114, 49, 63, 136, 109, 126, 48, 160, 142, 10
)]

# Drop rows 19 and 47, which hold missing data
# (there are better ways to handle missing data).
foo <- foo[-c(19, 47), ]

# Sanity check: list the positions of any NA cells that remain;
# an empty result means no missing values are left.
which(is.na(foo))

# Look at the first few rows to identify the columns.
head(foo)

### Outcome is "pbs2s3": "democracy" and "peace" within 2 years after the end of the war
### codebook is here: http://www.nyu.edu/gsas/dept/politics/faculty/cohen/codebook.pdf

### Treatment indicator is "untype4": "multidimensional peacekeeping/peacebuilding"

### How many treated units? How many controls? How do you feel about SUTVA?


# STEP 1:
# Take the logistic regression (the ‘original model’) that appears in Gary King's
# paper with Langche Zeng--it's available here:

# https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/DXRXCFAWPK
# go to PAGE 2... download/open "replication.do" ...

# Find the logistic regression model near the top, on the line starting with the word
# "logistic". Let this be your propensity score model, ignoring references to "cluster"
# (i.e., the model ends with 'decade'.)

# Call your propensity score model "glm1".
# e.g, glm1 <- glm(untype4 ~ wartype + logcost + ... , data = foo, family = binomial)

# TAKE NOTE OF:
# (a) the treatment effect (ATT), and the p-value of the treatment effect;
# (b) which conflicts in the treatment group get matched to which 'control' conflicts
#     (using $mdata OR $index.treated and $index.control)
# (c) the lowest p-value obtained from balance tests, after running MatchBalance
#     on the covariates: wartype, logcost, wardur, factnum, factnum2, trnsfcap,
#     treaty, develop, exp, decade...
#     SHORTCUT! COPY/PASTE THE BELOW (after matching, filling in the blank):
#
# The left-hand side of the MatchBalance formula must be the treatment
# indicator (untype4), not the outcome (pbs2s3); the right-hand side lists only
# the covariates whose balance is being checked. Listing the treatment itself
# as a covariate would distort the "lowest p-value" you are asked to record.
# FILL_IN_BLANK is a placeholder for your matching output (the object returned
# by Match()).
mb <- MatchBalance(
  untype4 ~ wartype + logcost + wardur + factnum + factnum2 + trnsfcap +
    treaty + develop + exp + decade,
  data = foo, match.out = FILL_IN_BLANK, nboots = 500
)


# STEP 2:
# Perform genetic matching, with X equal to all the covariates listed in the
# MatchBalance call above (leave the treatment indicator, untype4, out of X).
# pop.size should be at least 200, max.generations should be at least 20,
# and wait.generations should be at least 10. Estimand is "ATT".

# Take note of the same 3 results as above. Check the rgenoud output
# to see if the genetic algorithm improves fitness over time.


# STEP 3:
# Augment your STEP 2 analysis by including your propensity score in "X"...
# e.g., X <- cbind(X, glm1$fitted)
# Take note of the same 3 results. See if your balance improves with inclusion
# of the propensity score as a matching variable.  See if your impact
# estimate changes. See if you can improve your results (e.g., balance)
# by changing the matching variables, or the matching parameters (e.g., "M")
	# PEACEKEEPING WORKOUT (based on King, Gary; Zeng, Langche, 2007,
	# "Replication data for: When Can History be Our Guide?
	# The Pitfalls of Counterfactual Inference",
	# https://hdl.handle.net/1902.1/DXRXCFAWPK,
	# Harvard Dataverse, V4,
	# UNF:3:DaYlT6QSX9r0D50ye+tXpA== [fileUNF] )
	# CONSIDER USING THE JUPYTER NOTEBOOK WITH R-SERVER KERNEL (NEVER R-SAGE KERNEL)
	# Download the peacekeeping data set straight from the course server.
	foo <- read.csv(
	  file = "https://course-resources.minerva.kgi.edu/uploaded_files/mke/00086677-3767/peace.csv"
	)

	# Keep only the columns used in this exercise, selected by position
	# (the order given here is the column order of the resulting data frame).
	foo <- foo[, c(
	  6:8, 11:16,
	  99, 50, 114, 49, 63, 136, 109, 126, 48, 160, 142, 10
	)]

	# Drop rows 19 and 47, which hold missing data
	# (there are better ways to handle missing data).
	foo <- foo[-c(19, 47), ]

	# Sanity check: list the positions of any NA cells that remain;
	# an empty result means no missing values are left.
	which(is.na(foo))

	# Look at the first few rows to identify the columns.
	head(foo)

	### Outcome is "pbs2s3": "democracy" and "peace" within 2 years after the end of the war
	### codebook is here: http://www.nyu.edu/gsas/dept/politics/faculty/cohen/codebook.pdf

	### Treatment indicator is "untype4": "multidimensional peacekeeping/peacebuilding"

	### How many treated units? How many controls? How do you feel about SUTVA?



	# STEP 1:
	# Take the logistic regression (the ‘original model’) that appears in Gary King's
	# paper with Langche Zeng--it's available here:

	# https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/DXRXCFAWPK
	# go to PAGE 2... download/open "replication.do" ...

	# Find the logistic regression model near the top, on the line starting with the word
	# "logistic". Let this be your propensity score model, ignoring references to "cluster"
	# (i.e., the model ends with 'decade'.)

	# Call your propensity score model "glm1".
	# e.g, glm1 <- glm(untype4 ~ wartype + logcost + ... , data = foo, family = binomial)

	# TAKE NOTE OF:
	# (a) the treatment effect (ATT), and the p-value of the treatment effect;
	# (b) which conflicts in the treatment group get matched to which 'control' conflicts
	# (using $mdata OR $index.treated and $index.control)
	# (c) the lowest p-value obtained from balance tests, after running MatchBalance
	# on the covariates: wartype, logcost, wardur, factnum, factnum2, trnsfcap,
	# treaty, develop, exp, decade...
	# SHORTCUT! COPY/PASTE THE BELOW (after matching, filling in the blank):
	#
	# The left-hand side of the MatchBalance formula must be the treatment
	# indicator (untype4), not the outcome (pbs2s3); the right-hand side lists only
	# the covariates whose balance is being checked. Listing the treatment itself
	# as a covariate would distort the "lowest p-value" you are asked to record.
	# FILL_IN_BLANK is a placeholder for your matching output (the object returned
	# by Match()).
	mb <- MatchBalance(
	  untype4 ~ wartype + logcost + wardur + factnum + factnum2 + trnsfcap +
	    treaty + develop + exp + decade,
	  data = foo, match.out = FILL_IN_BLANK, nboots = 500
	)




	# STEP 2:
	# Perform genetic matching, with X equal to all the covariates listed in the
	# MatchBalance call above (leave the treatment indicator, untype4, out of X).
	# pop.size should be at least 200, max.generations should be at least 20,
	# and wait.generations should be at least 10. Estimand is "ATT".

	# Take note of the same 3 results as above. Check the rgenoud output
	# to see if the genetic algorithm improves fitness over time.




	# STEP 3:
	# Augment your STEP 2 analysis by including your propensity score in "X"...
	# e.g., X <- cbind(X, glm1$fitted)
	# Take note of the same 3 results. See if your balance improves with inclusion
	# of the propensity score as a matching variable. See if your impact
	# estimate changes. See if you can improve your results (e.g., balance)
	# by changing the matching variables, or the matching parameters (e.g., "M")