BioSciEconomist/MatchIt intro.R

## MatchIt intro.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: MatchIt intro.R
# | DATE: 1/12/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:  /Google Drive/R Training
# *----------------------------------------------------------------
# | PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for
# | Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart
# | Political Analysis (2007) 15:199-236
# *----------------------------------------------------------------


library(dplyr)
library(lubridate)
rm(list=ls())

options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

library(MatchIt)
library(dplyr)

#-----------------------------
# data loading and customization
#-----------------------------

data("lalonde") # load data
head(lalonde)

df <- lalonde # resave data frame

# export to csv for other work
# write.csv(lalonde, file = "/Users/amandabogard/Google Drive/R Training/lalonde.csv", row.names = FALSE)

# create row ID in case we need this
df$RowID <- 1:nrow(df)

# convert index to column
mbrid <- rownames(df)
df <- cbind(mbrid,df)
df$mbrid <- as.character(df$mbrid) # character conversion

# create an index date for the treatment group for demo purposes
SVC_FROM_DT <- seq(as.Date("2000/1/1"), by="month", length=614)
df <-cbind(df,SVC_FROM_DT)

# set date to dummy date for controls
df$SVC_FROM_DT <- ifelse(df$treat==1,df$SVC_FROM_DT,as.Date("2020-01-01"))
df$SVC_FROM_DT <- as.Date(df$SVC_FROM_DT,origin = "1970-01-01") # fix formats

#-----------------------------------------
# explore data and check balance on covariates prior to matching
#----------------------------------------

summary(df$treat) # treatment vs controls
table(df$treat)

# using dplyr with binary target
df %>% group_by(treat) %>%     # group by treatment
  summarize(AVGre74 = mean(re74),
            SDre74 = sd(re74),
            AVGre75 = mean(re75),
            SDre75 = sd(re75),
            AVGage = mean(age),
            SDage = sd(age),
            AVGedu = mean(educ),
            SDedu = sd(educ)
  )

# distributional summaries
by(df$re74, df$treat,summary)
by(df$re74, df$treat,sd)
by(df$re75, df$treat,summary)
by(df$re75, df$treat,sd)

# histograms
par(mfrow=c(1,2))
hist(df$re74[df$treat==1])
hist(df$re74[df$treat==0])
dev.off() # reset par

## back to back
h1 = hist(df$re74[df$treat==1],plot=FALSE)
h2 = hist(df$re74[df$treat==0],plot=FALSE)
h2$counts = - h2$counts
hmax = max(h1$counts)
hmin = min(h2$counts)
X = c(h1$breaks, h2$breaks)
xmax = max(X)
xmin = min(X)
plot(h1, ylim=c(hmin, hmax), col="green", xlim=c(xmin, xmax))
lines(h2, col="blue")

### overalapping densities by treatment and control groups

ds <- rbind(data.frame(dat=df[,][,"re75"], grp="All"),
            data.frame(dat=df[,][df$treat==1,"re75"], grp="trt"),
            data.frame(dat=df[,][df$treat==0,"re75"], grp="ctrl"))

# histogram for all  data
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re75", col="grey90", breaks="fd", border=TRUE)

# compute density, rescale, and plot for all data
# dens <- density(ds[ds$grp=="All",1], na.rm=TRUE)
# rs <- max(hs$counts)/max(dens$y)
# lines(dens$x, dens$y*rs, type="l", col="black")

# compute density, rescale, and plot for treatment
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="red")

# compute density, rescale, and plot for control
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="green")

# Add a legend to the plot.
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))
dev.off()


# Grouped Bar Plot by target (iteration)
xvar <- df$nodegree # black hispan married nodegree
lbl <- "nodegree"
counts <- table(df$treat, xvar)
barplot(counts, main="Categorical Balance by Treatment",
        xlab= lbl, col=c("darkblue","red"),
        legend = rownames(counts), beside=TRUE)


# what is the raw difference in outcome re78 between treatment and controls
df%>%
  group_by(treat) %>%     # group by treatment
  summarize(AVGre78 = mean(re78))
# treatment group appears to have lower income

# is this difference significant
t.test(df$re78[df$treat==1],df$re78[df$treat==0],paired=FALSE)
summary(lm(re78 ~  treat, data = df))

# difference with controls
summary(lm(re78 ~  treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df))

# controlled comparisons using regression indicate a postive treatment effect
# on the order of $1548

#----------------------------------------------
# match on propensity scores
#---------------------------------------------

# reference model
summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df))

# match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps)
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1,
                 distance= "logit", caliper=0.10, replace= F)

# match data using nearest neighbor 2:1 greedy matching using logistic regression and caliper = .10 sd(ps)
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=2,
                 distance= "logit", caliper=0.10, replace= F)

# match using 1:1 matching and hard matching on binary categoricals
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df, method= "nearest",
                 ratio=1, distance= "logit", exact = c("black","hispan","married","nodegree"), caliper=0.10, replace= F)

summary(m.out) # check balance for each iteration above to determine which approach gives the most balance
               # (notice the improvement of matches with caliper = .10 vs .25)

### Obtain matched analysis file
df1 <- match.data(m.out)

#---------------------------------------
#  examining matched observations
#---------------------------------------

# although  "matching does not require pairing observations"...."only the distributions
# need be matched as closely as possible" in some applications we want to pull pre and post period data for
# treatments and controls based on an index date representing the date of treatment for the treatment group.
# For control groups we typically 'assign' an index date based on the index date of the unit of observation
# they are matched to. In this case we need to identfy 'matched pairs' to get the index date

matchids <- data.frame(m.out$match.matrix) # extract matched pairs

# convert index to column to get treatment id and rename 1st column as the control id
trtid <- rownames(matchids)
matchids <- cbind(trtid,matchids)%>%rename(ctrlid = X1)

# format conversions
matchids$trtid <- as.character(matchids$trtid)
matchids$ctrlid <- as.character(matchids$ctrlid)

# get only treatment data from matched output data to form a treatment cohort file
trtchrt <- df1%>%filter(treat ==1)
trtchrt$trtid <- trtchrt$mbrid

# from matchids add the corresponding control id to the treatment cohort file
trtchrt <- trtchrt%>%left_join(select(matchids,ctrlid,trtid), by = "trtid")

# get only control data from matched output data to form a contrl cohort file with their ps and id
ctrlchrt <- df1%>%filter(treat ==0)%>%rename(psctrl = distance) # filter controls and rename ps variable
ctrlchrt$ctrlid <- ctrlchrt$mbrid # create ctrl id

# add the matched control's ps to the treatment cohort file joining on the ctrlid
trtchrt <- trtchrt%>%left_join(select(ctrlchrt,ctrlid,psctrl), by = "ctrlid")

# extract control ID and relevant date that we will use for the index date
# this date corresponds to the matched treatment member's program start date and
# will be used as the index date for the control member

matchindexdt <- trtchrt[c('SVC_FROM_DT','ctrlid')]%>%rename(ctrlindex = SVC_FROM_DT,mbrid =ctrlid)

# merge with matched analysis file
df1 <- df1%>%left_join(select(matchindexdt,mbrid,ctrlindex), by = "mbrid")

# create index date for matched analysis file
df1$index_dt <- ifelse(df1$treat,df1$SVC_FROM_DT,df1$ctrlindex)
df1$index_dt <- as.Date(df1$index_dt,origin = "1970-01-01")

# check a few examples to makes sure matched ids and dates line up across treatments and controls
df1[df1$mbrid %in% c('PSID337', 'PSID240', 'PSID256'),]%>%select(mbrid,index_dt)
matchids[matchids$ctrlid %in%c('PSID337', 'PSID240', 'PSID256'),]
df[df$mbrid %in% c('NSW1','NSW2','NSW3'),]


#----------------------------------------
# explore and check balance after matching
#----------------------------------------

# check balance
summary(m.out)

# check both means and sds
df1 %>% group_by(treat) %>%     # group by treatment
  summarize(AVGre74 = mean(re74),
            SDre74 = sd(re74),
            AVGre75 = mean(re75),
            SDre75 = sd(re75),
            AVGage = mean(age),
            SDage = sd(age),
            AVGedu = mean(educ),
            SDedu = sd(educ)
  )

# visualize balance diagnostics
plot(m.out) # default QQ plots
plot(m.out, type = "hist") # common support of matched groups
plot(m.out, type = "jitter") # scatter plots of propensity scores common support

# histograms
par(mfrow=c(1,2))
hist(df1$re74[df1$treat==1])
hist(df1$re74[df1$treat==0])
dev.off() # reset par

### overalapping densities by treatment and control groups
ds <- rbind(data.frame(dat=df1[,][,"re74"], grp="All"),
            data.frame(dat=df1[,][df1$treat==1,"re75"], grp="trt"),
            data.frame(dat=df1[,][df1$treat==0,"re75"], grp="ctrl"))

# histogram for all  data
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re74", col="grey90", breaks="fd", border=TRUE)

# compute density, rescale, and plot for treatment
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="red")

# compute density, rescale, and plot for control
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="green")

# Add a legend to the plot.
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))

dev.off()

#-----------------------------------------
# estimating treatment effects
#----------------------------------------

# what is the difference between outcomes for matched data
df1%>%
  group_by(treat) %>%     # group by treatment
  summarize(AVGre78 = mean(re78))
# treatment now has much higher value of outcome

# t-test
t.test(df1$re78[df1$treat==1],df1$re78[df1$treat==0],paired=FALSE)

# regression
summary(lm(re78 ~  treat, data = df1))

# regression with added controls
summary(lm(re78 ~  treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df1))

# regression adjustment does not remove much additional bias or improve significance


#---------------------------------------
# sensitivity analysis
#--------------------------------------

library(rbounds)

# the value of Gamma is interpreted as the odds of treatment assignment hidden bias.
# A change in the odds lower/upper bounds from significant to non-significant (or vice-versa)
# indicates by how much the odds need to change before the statistical significance of the outcome
# shifts. For example, in Figure 17, the lower bound estimate changes from non-significant (0.0541)
# to significant (0.0194) when gamma is 1.3. That is, a change of 0.3 in the odds will produce a
# change in the significance value. Rosenbaum (2002) defines a study as sensitive if values of
# Gamma close to 1 lead to changes in significance compared to those that could be obtained if the
# study is free of bias. Thus results will be more robust to hidden bias, if a very large change
# in the odds is needed before a change in statistical significance happens.

# Sensitivity Test Example https://www.rdocumentation.org/packages/rbounds/versions/2.1/topics/psens
trt <- c(38, 23, 41, 18, 37, 36, 23, 62, 31, 34, 24, 14, 21, 17, 16, 20,
         15, 10, 45, 39, 22, 35, 49, 48, 44, 35, 43, 39, 34, 13, 73, 25,
         27)
ctrl <- c(16, 18, 18, 24, 19, 11, 10, 15, 16, 18, 18, 13, 19, 10, 16,
          16, 24, 13, 9, 14, 21, 19, 7, 18, 19, 12, 11, 22, 25, 16, 13,
          11, 13)

psens(trt, ctrl)

# create trt and control groups sorted by propensity
trt <- df1%>%
  filter(treat==1)%>%
  arrange(distance)

ctrl <- df1%>%
  filter(treat==0)%>%
  arrange(distance)

trt <- as.vector(trt$re78)
ctrl <- as.vector(ctrl$re78)

psens(trt, ctrl,Gamma = 2, GammaInc = 0.1)


#-----------------------------------
# subclassification
#-----------------------------------

m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "subclass",
                 distance= "logit", subclass = 5)

summary(m.out)

# estimation of treatment effects
df3 <- match.data(m.out)

# Subclass-specific t-test for given subclass
t.test(re78~treat, data=df3, subset=subclass==1)

# Equivalent to subclass-specific t-test for all subclasses
summary(lm(re78 ~ subclass + treat + subclass*treat, data = df3))

# ATE estimate (combining subclass-specific estimate)
# define N = total number of people
N <- dim(df3)[1]

# Initialize vectors for subclass-specific effects (“sub.effect”), variances(“sub.var”),
# and sample size(“sub.N”)
sub.effect <- rep(NA, max(df3$subclass))
sub.var <- rep(NA, max(df3$subclass))
sub.N <- rep(NA, max(df3$subclass))


# Run linear regression model within each subclass
for(s in 1:max(df3$subclass)){
  tmp <- lm(re78 ~ treat, data=df3, subset=subclass==s)
  sub.effect[s] <- tmp$coef[2]
  sub.var[s] <- summary(tmp)$coef[2,2]^2
  sub.N[s] <- sum(df3$subclass==s)}

# Calculate overall ATE effect
ATE.effect <- sum((sub.N/N)*sub.effect)
ATE.stderror <- sqrt(sum((sub.N/N)^2*sub.var))

print(ATE.effect)
print(ATE.stderror)

#-----------------------------------
# regression based on trimmed ps
#------------------------------------

ps <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = df, family = binomial)
summary(ps)

df4 <- df # copy analysis file

# Attach the predicted propensity score to the datafile
df4$pscore <- predict(ps, type = "response")

# trim propensity scores
summary(df4$pscore)

df4%>%
  summarize(mean = mean(pscore,na.rm = TRUE),
            min = min(pscore,na.rm = TRUE),
            P10 = quantile(pscore, probs=0.10,na.rm = TRUE),
            Q1=quantile(pscore, probs=0.25,na.rm = TRUE),
            med = median(pscore,na.rm = TRUE),
            Q3=quantile(pscore, probs=0.75,na.rm = TRUE),
            P85=quantile(pscore, probs=0.85,na.rm = TRUE),
            P90=quantile(pscore, probs=0.90,na.rm = TRUE),
            P95=quantile(pscore, probs=0.95,na.rm = TRUE),
            P99=quantile(pscore, probs=0.99,na.rm = TRUE))

hist(df4$pscore)
dev.off()

# restrict data to ps range .10 <= ps <= .90
df5 <- df4[df4$pscore >= .03 & df4$pscore <=.72,]
summary(df5$pscore)
hist(df5$pscore)


# regression with controls on propensity score screened data set
summary(lm(re78 ~  treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df5))


# quick balance checks
df5 %>% group_by(treat) %>%     # group by treatment
  summarize(AVGre74 = mean(re74),
            AVGre75 = mean(re75),
            AVGage = mean(age),
            AVGedu = mean(educ),
            AVGblack = mean(black),
            AVGhisp = mean(hispan),
            AVGmarr = mean(married),
            AVGndeg = mean(nodegree)

  )

# compare to unmathced data
df %>% group_by(treat) %>%     # group by treatment
  summarize(AVGre74 = mean(re74),
            AVGre75 = mean(re75),
            AVGage = mean(age),
            AVGedu = mean(educ),
            AVGblack = mean(black),
            AVGhisp = mean(hispan),
            AVGmarr = mean(married),
            AVGndeg = mean(nodegree)

  )
#-----------------------------------
# notes
#------------------------------------


# https://stats.stackexchange.com/questions/66132/understanding-output-of-matchit-in-r
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: MatchIt intro.R
	# \| DATE: 1/12/19
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE: /Google Drive/R Training
	# *----------------------------------------------------------------
	# \| PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for
	# \| Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart
	# \| Political Analysis (2007) 15:199-236
	# *----------------------------------------------------------------


	library(dplyr)
	library(lubridate)
	rm(list=ls())

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	library(MatchIt)
	library(dplyr)

	#-----------------------------
	# data loading and customization
	#-----------------------------

	data("lalonde") # load data
	head(lalonde)

	df <- lalonde # resave data frame

	# export to csv for other work
	# write.csv(lalonde, file = "/Users/amandabogard/Google Drive/R Training/lalonde.csv", row.names = FALSE)

	# create row ID in case we need this
	df$RowID <- 1:nrow(df)

	# convert index to column
	mbrid <- rownames(df)
	df <- cbind(mbrid,df)
	df$mbrid <- as.character(df$mbrid) # character conversion

	# create an index date for the treatment group for demo purposes
	SVC_FROM_DT <- seq(as.Date("2000/1/1"), by="month", length=614)
	df <-cbind(df,SVC_FROM_DT)

	# set date to dummy date for controls
	df$SVC_FROM_DT <- ifelse(df$treat==1,df$SVC_FROM_DT,as.Date("2020-01-01"))
	df$SVC_FROM_DT <- as.Date(df$SVC_FROM_DT,origin = "1970-01-01") # fix formats

	#-----------------------------------------
	# explore data and check balance on covariates prior to matching
	#----------------------------------------

	summary(df$treat) # treatment vs controls
	table(df$treat)

	# using dplyr with binary target
	df %>% group_by(treat) %>% # group by treatment
	summarize(AVGre74 = mean(re74),
	SDre74 = sd(re74),
	AVGre75 = mean(re75),
	SDre75 = sd(re75),
	AVGage = mean(age),
	SDage = sd(age),
	AVGedu = mean(educ),
	SDedu = sd(educ)
	)

	# distributional summaries
	by(df$re74, df$treat,summary)
	by(df$re74, df$treat,sd)
	by(df$re75, df$treat,summary)
	by(df$re75, df$treat,sd)

	# histograms
	par(mfrow=c(1,2))
	hist(df$re74[df$treat==1])
	hist(df$re74[df$treat==0])
	dev.off() # reset par

	## back to back
	h1 = hist(df$re74[df$treat==1],plot=FALSE)
	h2 = hist(df$re74[df$treat==0],plot=FALSE)
	h2$counts = - h2$counts
	hmax = max(h1$counts)
	hmin = min(h2$counts)
	X = c(h1$breaks, h2$breaks)
	xmax = max(X)
	xmin = min(X)
	plot(h1, ylim=c(hmin, hmax), col="green", xlim=c(xmin, xmax))
	lines(h2, col="blue")

	### overalapping densities by treatment and control groups

	ds <- rbind(data.frame(dat=df[,][,"re75"], grp="All"),
	data.frame(dat=df[,][df$treat==1,"re75"], grp="trt"),
	data.frame(dat=df[,][df$treat==0,"re75"], grp="ctrl"))

	# histogram for all data
	hs <- hist(ds[ds$grp=="All",1], main="", xlab="re75", col="grey90", breaks="fd", border=TRUE)

	# compute density, rescale, and plot for all data
	# dens <- density(ds[ds$grp=="All",1], na.rm=TRUE)
	# rs <- max(hs$counts)/max(dens$y)
	# lines(dens$x, dens$y*rs, type="l", col="black")

	# compute density, rescale, and plot for treatment
	dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
	rs <- max(hs$counts)/max(dens$y)
	lines(dens$x, dens$y*rs, type="l", col="red")

	# compute density, rescale, and plot for control
	dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
	rs <- max(hs$counts)/max(dens$y)
	lines(dens$x, dens$y*rs, type="l", col="green")

	# Add a legend to the plot.
	legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))
	dev.off()


	# Grouped Bar Plot by target (iteration)
	xvar <- df$nodegree # black hispan married nodegree
	lbl <- "nodegree"
	counts <- table(df$treat, xvar)
	barplot(counts, main="Categorical Balance by Treatment",
	xlab= lbl, col=c("darkblue","red"),
	legend = rownames(counts), beside=TRUE)


	# what is the raw difference in outcome re78 between treatment and controls
	df%>%
	group_by(treat) %>% # group by treatment
	summarize(AVGre78 = mean(re78))
	# treatment group appears to have lower income

	# is this difference significant
	t.test(df$re78[df$treat==1],df$re78[df$treat==0],paired=FALSE)
	summary(lm(re78 ~ treat, data = df))

	# difference with controls
	summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df))

	# controlled comparisons using regression indicate a postive treatment effect
	# on the order of $1548

	#----------------------------------------------
	# match on propensity scores
	#---------------------------------------------

	# reference model
	summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df))

	# match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps)
	set.seed(1234)
	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1,
	distance= "logit", caliper=0.10, replace= F)

	# match data using nearest neighbor 2:1 greedy matching using logistic regression and caliper = .10 sd(ps)
	set.seed(1234)
	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=2,
	distance= "logit", caliper=0.10, replace= F)

	# match using 1:1 matching and hard matching on binary categoricals
	set.seed(1234)
	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df, method= "nearest",
	ratio=1, distance= "logit", exact = c("black","hispan","married","nodegree"), caliper=0.10, replace= F)

	summary(m.out) # check balance for each iteration above to determine which approach gives the most balance
	# (notice the improvement of matches with caliper = .10 vs .25)

	### Obtain matched analysis file
	df1 <- match.data(m.out)

	#---------------------------------------
	# examining matched observations
	#---------------------------------------

	# although "matching does not require pairing observations"...."only the distributions
	# need be matched as closely as possible" in some applications we want to pull pre and post period data for
	# treatments and controls based on an index date representing the date of treatment for the treatment group.
	# For control groups we typically 'assign' an index date based on the index date of the unit of observation
	# they are matched to. In this case we need to identfy 'matched pairs' to get the index date

	matchids <- data.frame(m.out$match.matrix) # extract matched pairs

	# convert index to column to get treatment id and rename 1st column as the control id
	trtid <- rownames(matchids)
	matchids <- cbind(trtid,matchids)%>%rename(ctrlid = X1)

	# format conversions
	matchids$trtid <- as.character(matchids$trtid)
	matchids$ctrlid <- as.character(matchids$ctrlid)

	# get only treatment data from matched output data to form a treatment cohort file
	trtchrt <- df1%>%filter(treat ==1)
	trtchrt$trtid <- trtchrt$mbrid

	# from matchids add the corresponding control id to the treatment cohort file
	trtchrt <- trtchrt%>%left_join(select(matchids,ctrlid,trtid), by = "trtid")

	# get only control data from matched output data to form a contrl cohort file with their ps and id
	ctrlchrt <- df1%>%filter(treat ==0)%>%rename(psctrl = distance) # filter controls and rename ps variable
	ctrlchrt$ctrlid <- ctrlchrt$mbrid # create ctrl id

	# add the matched control's ps to the treatment cohort file joining on the ctrlid
	trtchrt <- trtchrt%>%left_join(select(ctrlchrt,ctrlid,psctrl), by = "ctrlid")

	# extract control ID and relevant date that we will use for the index date
	# this date corresponds to the matched treatment member's program start date and
	# will be used as the index date for the control member

	matchindexdt <- trtchrt[c('SVC_FROM_DT','ctrlid')]%>%rename(ctrlindex = SVC_FROM_DT,mbrid =ctrlid)

	# merge with matched analysis file
	df1 <- df1%>%left_join(select(matchindexdt,mbrid,ctrlindex), by = "mbrid")

	# create index date for matched analysis file
	df1$index_dt <- ifelse(df1$treat,df1$SVC_FROM_DT,df1$ctrlindex)
	df1$index_dt <- as.Date(df1$index_dt,origin = "1970-01-01")

	# check a few examples to makes sure matched ids and dates line up across treatments and controls
	df1[df1$mbrid %in% c('PSID337', 'PSID240', 'PSID256'),]%>%select(mbrid,index_dt)
	matchids[matchids$ctrlid %in%c('PSID337', 'PSID240', 'PSID256'),]
	df[df$mbrid %in% c('NSW1','NSW2','NSW3'),]


	#----------------------------------------
	# explore and check balance after matching
	#----------------------------------------

	# check balance
	summary(m.out)

	# check both means and sds
	df1 %>% group_by(treat) %>% # group by treatment
	summarize(AVGre74 = mean(re74),
	SDre74 = sd(re74),
	AVGre75 = mean(re75),
	SDre75 = sd(re75),
	AVGage = mean(age),
	SDage = sd(age),
	AVGedu = mean(educ),
	SDedu = sd(educ)
	)

	# visualize balance diagnostics
	plot(m.out) # default QQ plots
	plot(m.out, type = "hist") # common support of matched groups
	plot(m.out, type = "jitter") # scatter plots of propensity scores common support

	# histograms
	par(mfrow=c(1,2))
	hist(df1$re74[df1$treat==1])
	hist(df1$re74[df1$treat==0])
	dev.off() # reset par

	### overalapping densities by treatment and control groups
	ds <- rbind(data.frame(dat=df1[,][,"re74"], grp="All"),
	data.frame(dat=df1[,][df1$treat==1,"re75"], grp="trt"),
	data.frame(dat=df1[,][df1$treat==0,"re75"], grp="ctrl"))

	# histogram for all data
	hs <- hist(ds[ds$grp=="All",1], main="", xlab="re74", col="grey90", breaks="fd", border=TRUE)

	# compute density, rescale, and plot for treatment
	dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
	rs <- max(hs$counts)/max(dens$y)
	lines(dens$x, dens$y*rs, type="l", col="red")

	# compute density, rescale, and plot for control
	dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
	rs <- max(hs$counts)/max(dens$y)
	lines(dens$x, dens$y*rs, type="l", col="green")

	# Add a legend to the plot.
	legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))

	dev.off()

	#-----------------------------------------
	# estimating treatment effects
	#----------------------------------------

	# what is the difference between outcomes for matched data
	df1%>%
	group_by(treat) %>% # group by treatment
	summarize(AVGre78 = mean(re78))
	# treatment now has much higher value of outcome

	# t-test
	t.test(df1$re78[df1$treat==1],df1$re78[df1$treat==0],paired=FALSE)

	# regression
	summary(lm(re78 ~ treat, data = df1))

	# regression with added controls
	summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df1))

	# regression adjustment does not remove much additional bias or improve significance


	#---------------------------------------
	# sensitivity analysis
	#--------------------------------------

	library(rbounds)

	# the value of Gamma is interpreted as the odds of treatment assignment hidden bias.
	# A change in the odds lower/upper bounds from significant to non-significant (or vice-versa)
	# indicates by how much the odds need to change before the statistical significance of the outcome
	# shifts. For example, in Figure 17, the lower bound estimate changes from non-significant (0.0541)
	# to significant (0.0194) when gamma is 1.3. That is, a change of 0.3 in the odds will produce a
	# change in the significance value. Rosenbaum (2002) defines a study as sensitive if values of
	# Gamma close to 1 lead to changes in significance compared to those that could be obtained if the
	# study is free of bias. Thus results will be more robust to hidden bias, if a very large change
	# in the odds is needed before a change in statistical significance happens.

	# Sensitivity Test Example https://www.rdocumentation.org/packages/rbounds/versions/2.1/topics/psens
	trt <- c(38, 23, 41, 18, 37, 36, 23, 62, 31, 34, 24, 14, 21, 17, 16, 20,
	15, 10, 45, 39, 22, 35, 49, 48, 44, 35, 43, 39, 34, 13, 73, 25,
	27)
	ctrl <- c(16, 18, 18, 24, 19, 11, 10, 15, 16, 18, 18, 13, 19, 10, 16,
	16, 24, 13, 9, 14, 21, 19, 7, 18, 19, 12, 11, 22, 25, 16, 13,
	11, 13)

	psens(trt, ctrl)

	# create trt and control groups sorted by propensity
	trt <- df1%>%
	filter(treat==1)%>%
	arrange(distance)

	ctrl <- df1%>%
	filter(treat==0)%>%
	arrange(distance)

	trt <- as.vector(trt$re78)
	ctrl <- as.vector(ctrl$re78)

	psens(trt, ctrl,Gamma = 2, GammaInc = 0.1)



	#-----------------------------------
	# subclassification
	#-----------------------------------

	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "subclass",
	distance= "logit", subclass = 5)

	summary(m.out)

	# estimation of treatment effects
	df3 <- match.data(m.out)

	# Subclass-specific t-test for given subclass
	t.test(re78~treat, data=df3, subset=subclass==1)

	# Equivalent to subclass-specific t-test for all subclasses
	summary(lm(re78 ~ subclass + treat + subclass*treat, data = df3))

	# ATE estimate (combining subclass-specific estimate)
	# define N = total number of people
	N <- dim(df3)[1]

	# Initialize vectors for subclass-specific effects (“sub.effect”), variances(“sub.var”),
	# and sample size(“sub.N”)
	sub.effect <- rep(NA, max(df3$subclass))
	sub.var <- rep(NA, max(df3$subclass))
	sub.N <- rep(NA, max(df3$subclass))


	# Run linear regression model within each subclass
	for(s in 1:max(df3$subclass)){
	tmp <- lm(re78 ~ treat, data=df3, subset=subclass==s)
	sub.effect[s] <- tmp$coef[2]
	sub.var[s] <- summary(tmp)$coef[2,2]^2
	sub.N[s] <- sum(df3$subclass==s)}

	# Calculate overall ATE effect
	ATE.effect <- sum((sub.N/N)*sub.effect)
	ATE.stderror <- sqrt(sum((sub.N/N)^2*sub.var))

	print(ATE.effect)
	print(ATE.stderror)

	#-----------------------------------
	# regression based on trimmed ps
	#------------------------------------

	ps <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = df, family = binomial)
	summary(ps)

	df4 <- df # copy analysis file

	# Attach the predicted propensity score to the datafile
	df4$pscore <- predict(ps, type = "response")

	# trim propensity scores
	summary(df4$pscore)

	df4%>%
	summarize(mean = mean(pscore,na.rm = TRUE),
	min = min(pscore,na.rm = TRUE),
	P10 = quantile(pscore, probs=0.10,na.rm = TRUE),
	Q1=quantile(pscore, probs=0.25,na.rm = TRUE),
	med = median(pscore,na.rm = TRUE),
	Q3=quantile(pscore, probs=0.75,na.rm = TRUE),
	P85=quantile(pscore, probs=0.85,na.rm = TRUE),
	P90=quantile(pscore, probs=0.90,na.rm = TRUE),
	P95=quantile(pscore, probs=0.95,na.rm = TRUE),
	P99=quantile(pscore, probs=0.99,na.rm = TRUE))

	hist(df4$pscore)
	dev.off()

	# restrict data to ps range .10 <= ps <= .90
	df5 <- df4[df4$pscore >= .03 & df4$pscore <=.72,]
	summary(df5$pscore)
	hist(df5$pscore)


	# regression with controls on propensity score screened data set
	summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df5))


	# quick balance checks
	df5 %>% group_by(treat) %>% # group by treatment
	summarize(AVGre74 = mean(re74),
	AVGre75 = mean(re75),
	AVGage = mean(age),
	AVGedu = mean(educ),
	AVGblack = mean(black),
	AVGhisp = mean(hispan),
	AVGmarr = mean(married),
	AVGndeg = mean(nodegree)

	)

	# compare to unmathced data
	df %>% group_by(treat) %>% # group by treatment
	summarize(AVGre74 = mean(re74),
	AVGre75 = mean(re75),
	AVGage = mean(age),
	AVGedu = mean(educ),
	AVGblack = mean(black),
	AVGhisp = mean(hispan),
	AVGmarr = mean(married),
	AVGndeg = mean(nodegree)

	)
	#-----------------------------------
	# notes
	#------------------------------------


	# https://stats.stackexchange.com/questions/66132/understanding-output-of-matchit-in-r