Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active January 18, 2019 01:41
Show Gist options
  • Save BioSciEconomist/d97a720c8b58fded158b9cb51acb5a70 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/d97a720c8b58fded158b9cb51acb5a70 to your computer and use it in GitHub Desktop.
demo of R matchit based on: Matching as Nonparametric Preprocessing for Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart Political Analysis (2007) 15:199-236
# *-----------------------------------------------------------------
# | PROGRAM NAME: MatchIt intro.R
# | DATE: 1/12/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: /Google Drive/R Training
# *----------------------------------------------------------------
# | PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for
# | Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart
# | Political Analysis (2007) 15:199-236
# *----------------------------------------------------------------
library(dplyr)
library(lubridate)
rm(list=ls())
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(MatchIt)
library(dplyr)
#-----------------------------
# data loading and customization
#-----------------------------
data("lalonde") # load data
head(lalonde)
df <- lalonde # resave data frame
# export to csv for other work
# write.csv(lalonde, file = "/Users/amandabogard/Google Drive/R Training/lalonde.csv", row.names = FALSE)
# create row ID in case we need this
df$RowID <- 1:nrow(df)
# convert index to column
mbrid <- rownames(df)
df <- cbind(mbrid,df)
df$mbrid <- as.character(df$mbrid) # character conversion
# create an index date for the treatment group for demo purposes
SVC_FROM_DT <- seq(as.Date("2000/1/1"), by="month", length=614)
df <-cbind(df,SVC_FROM_DT)
# set date to dummy date for controls
df$SVC_FROM_DT <- ifelse(df$treat==1,df$SVC_FROM_DT,as.Date("2020-01-01"))
df$SVC_FROM_DT <- as.Date(df$SVC_FROM_DT,origin = "1970-01-01") # fix formats
#-----------------------------------------
# explore data and check balance on covariates prior to matching
#----------------------------------------
summary(df$treat) # treatment vs controls
table(df$treat)
# using dplyr with binary target
df %>% group_by(treat) %>% # group by treatment
summarize(AVGre74 = mean(re74),
SDre74 = sd(re74),
AVGre75 = mean(re75),
SDre75 = sd(re75),
AVGage = mean(age),
SDage = sd(age),
AVGedu = mean(educ),
SDedu = sd(educ)
)
# distributional summaries
by(df$re74, df$treat,summary)
by(df$re74, df$treat,sd)
by(df$re75, df$treat,summary)
by(df$re75, df$treat,sd)
# histograms
par(mfrow=c(1,2))
hist(df$re74[df$treat==1])
hist(df$re74[df$treat==0])
dev.off() # reset par
## back to back
h1 = hist(df$re74[df$treat==1],plot=FALSE)
h2 = hist(df$re74[df$treat==0],plot=FALSE)
h2$counts = - h2$counts
hmax = max(h1$counts)
hmin = min(h2$counts)
X = c(h1$breaks, h2$breaks)
xmax = max(X)
xmin = min(X)
plot(h1, ylim=c(hmin, hmax), col="green", xlim=c(xmin, xmax))
lines(h2, col="blue")
### overalapping densities by treatment and control groups
ds <- rbind(data.frame(dat=df[,][,"re75"], grp="All"),
data.frame(dat=df[,][df$treat==1,"re75"], grp="trt"),
data.frame(dat=df[,][df$treat==0,"re75"], grp="ctrl"))
# histogram for all data
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re75", col="grey90", breaks="fd", border=TRUE)
# compute density, rescale, and plot for all data
# dens <- density(ds[ds$grp=="All",1], na.rm=TRUE)
# rs <- max(hs$counts)/max(dens$y)
# lines(dens$x, dens$y*rs, type="l", col="black")
# compute density, rescale, and plot for treatment
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="red")
# compute density, rescale, and plot for control
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="green")
# Add a legend to the plot.
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))
dev.off()
# Grouped Bar Plot by target (iteration)
xvar <- df$nodegree # black hispan married nodegree
lbl <- "nodegree"
counts <- table(df$treat, xvar)
barplot(counts, main="Categorical Balance by Treatment",
xlab= lbl, col=c("darkblue","red"),
legend = rownames(counts), beside=TRUE)
# what is the raw difference in outcome re78 between treatment and controls
df%>%
group_by(treat) %>% # group by treatment
summarize(AVGre78 = mean(re78))
# treatment group appears to have lower income
# is this difference significant
t.test(df$re78[df$treat==1],df$re78[df$treat==0],paired=FALSE)
summary(lm(re78 ~ treat, data = df))
# difference with controls
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df))
# controlled comparisons using regression indicate a postive treatment effect
# on the order of $1548
#----------------------------------------------
# match on propensity scores
#---------------------------------------------
# reference model
summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df))
# match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps)
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1,
distance= "logit", caliper=0.10, replace= F)
# match data using nearest neighbor 2:1 greedy matching using logistic regression and caliper = .10 sd(ps)
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=2,
distance= "logit", caliper=0.10, replace= F)
# match using 1:1 matching and hard matching on binary categoricals
set.seed(1234)
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df, method= "nearest",
ratio=1, distance= "logit", exact = c("black","hispan","married","nodegree"), caliper=0.10, replace= F)
summary(m.out) # check balance for each iteration above to determine which approach gives the most balance
# (notice the improvement of matches with caliper = .10 vs .25)
### Obtain matched analysis file
df1 <- match.data(m.out)
#---------------------------------------
# examining matched observations
#---------------------------------------
# although "matching does not require pairing observations"...."only the distributions
# need be matched as closely as possible" in some applications we want to pull pre and post period data for
# treatments and controls based on an index date representing the date of treatment for the treatment group.
# For control groups we typically 'assign' an index date based on the index date of the unit of observation
# they are matched to. In this case we need to identfy 'matched pairs' to get the index date
matchids <- data.frame(m.out$match.matrix) # extract matched pairs
# convert index to column to get treatment id and rename 1st column as the control id
trtid <- rownames(matchids)
matchids <- cbind(trtid,matchids)%>%rename(ctrlid = X1)
# format conversions
matchids$trtid <- as.character(matchids$trtid)
matchids$ctrlid <- as.character(matchids$ctrlid)
# get only treatment data from matched output data to form a treatment cohort file
trtchrt <- df1%>%filter(treat ==1)
trtchrt$trtid <- trtchrt$mbrid
# from matchids add the corresponding control id to the treatment cohort file
trtchrt <- trtchrt%>%left_join(select(matchids,ctrlid,trtid), by = "trtid")
# get only control data from matched output data to form a contrl cohort file with their ps and id
ctrlchrt <- df1%>%filter(treat ==0)%>%rename(psctrl = distance) # filter controls and rename ps variable
ctrlchrt$ctrlid <- ctrlchrt$mbrid # create ctrl id
# add the matched control's ps to the treatment cohort file joining on the ctrlid
trtchrt <- trtchrt%>%left_join(select(ctrlchrt,ctrlid,psctrl), by = "ctrlid")
# extract control ID and relevant date that we will use for the index date
# this date corresponds to the matched treatment member's program start date and
# will be used as the index date for the control member
matchindexdt <- trtchrt[c('SVC_FROM_DT','ctrlid')]%>%rename(ctrlindex = SVC_FROM_DT,mbrid =ctrlid)
# merge with matched analysis file
df1 <- df1%>%left_join(select(matchindexdt,mbrid,ctrlindex), by = "mbrid")
# create index date for matched analysis file
df1$index_dt <- ifelse(df1$treat,df1$SVC_FROM_DT,df1$ctrlindex)
df1$index_dt <- as.Date(df1$index_dt,origin = "1970-01-01")
# check a few examples to makes sure matched ids and dates line up across treatments and controls
df1[df1$mbrid %in% c('PSID337', 'PSID240', 'PSID256'),]%>%select(mbrid,index_dt)
matchids[matchids$ctrlid %in%c('PSID337', 'PSID240', 'PSID256'),]
df[df$mbrid %in% c('NSW1','NSW2','NSW3'),]
#----------------------------------------
# explore and check balance after matching
#----------------------------------------
# check balance
summary(m.out)
# check both means and sds
df1 %>% group_by(treat) %>% # group by treatment
summarize(AVGre74 = mean(re74),
SDre74 = sd(re74),
AVGre75 = mean(re75),
SDre75 = sd(re75),
AVGage = mean(age),
SDage = sd(age),
AVGedu = mean(educ),
SDedu = sd(educ)
)
# visualize balance diagnostics
plot(m.out) # default QQ plots
plot(m.out, type = "hist") # common support of matched groups
plot(m.out, type = "jitter") # scatter plots of propensity scores common support
# histograms
par(mfrow=c(1,2))
hist(df1$re74[df1$treat==1])
hist(df1$re74[df1$treat==0])
dev.off() # reset par
### overalapping densities by treatment and control groups
ds <- rbind(data.frame(dat=df1[,][,"re74"], grp="All"),
data.frame(dat=df1[,][df1$treat==1,"re75"], grp="trt"),
data.frame(dat=df1[,][df1$treat==0,"re75"], grp="ctrl"))
# histogram for all data
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re74", col="grey90", breaks="fd", border=TRUE)
# compute density, rescale, and plot for treatment
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="red")
# compute density, rescale, and plot for control
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col="green")
# Add a legend to the plot.
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green"))
dev.off()
#-----------------------------------------
# estimating treatment effects
#----------------------------------------
# what is the difference between outcomes for matched data
df1%>%
group_by(treat) %>% # group by treatment
summarize(AVGre78 = mean(re78))
# treatment now has much higher value of outcome
# t-test
t.test(df1$re78[df1$treat==1],df1$re78[df1$treat==0],paired=FALSE)
# regression
summary(lm(re78 ~ treat, data = df1))
# regression with added controls
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df1))
# regression adjustment does not remove much additional bias or improve significance
#---------------------------------------
# sensitivity analysis
#--------------------------------------
library(rbounds)
# the value of Gamma is interpreted as the odds of treatment assignment hidden bias.
# A change in the odds lower/upper bounds from significant to non-significant (or vice-versa)
# indicates by how much the odds need to change before the statistical significance of the outcome
# shifts. For example, in Figure 17, the lower bound estimate changes from non-significant (0.0541)
# to significant (0.0194) when gamma is 1.3. That is, a change of 0.3 in the odds will produce a
# change in the significance value. Rosenbaum (2002) defines a study as sensitive if values of
# Gamma close to 1 lead to changes in significance compared to those that could be obtained if the
# study is free of bias. Thus results will be more robust to hidden bias, if a very large change
# in the odds is needed before a change in statistical significance happens.
# Sensitivity Test Example https://www.rdocumentation.org/packages/rbounds/versions/2.1/topics/psens
trt <- c(38, 23, 41, 18, 37, 36, 23, 62, 31, 34, 24, 14, 21, 17, 16, 20,
15, 10, 45, 39, 22, 35, 49, 48, 44, 35, 43, 39, 34, 13, 73, 25,
27)
ctrl <- c(16, 18, 18, 24, 19, 11, 10, 15, 16, 18, 18, 13, 19, 10, 16,
16, 24, 13, 9, 14, 21, 19, 7, 18, 19, 12, 11, 22, 25, 16, 13,
11, 13)
psens(trt, ctrl)
# create trt and control groups sorted by propensity
trt <- df1%>%
filter(treat==1)%>%
arrange(distance)
ctrl <- df1%>%
filter(treat==0)%>%
arrange(distance)
trt <- as.vector(trt$re78)
ctrl <- as.vector(ctrl$re78)
psens(trt, ctrl,Gamma = 2, GammaInc = 0.1)
#-----------------------------------
# subclassification
#-----------------------------------
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "subclass",
distance= "logit", subclass = 5)
summary(m.out)
# estimation of treatment effects
df3 <- match.data(m.out)
# Subclass-specific t-test for given subclass
t.test(re78~treat, data=df3, subset=subclass==1)
# Equivalent to subclass-specific t-test for all subclasses
summary(lm(re78 ~ subclass + treat + subclass*treat, data = df3))
# ATE estimate (combining subclass-specific estimate)
# define N = total number of people
N <- dim(df3)[1]
# Initialize vectors for subclass-specific effects (“sub.effect”), variances(“sub.var”),
# and sample size(“sub.N”)
sub.effect <- rep(NA, max(df3$subclass))
sub.var <- rep(NA, max(df3$subclass))
sub.N <- rep(NA, max(df3$subclass))
# Run linear regression model within each subclass
for(s in 1:max(df3$subclass)){
tmp <- lm(re78 ~ treat, data=df3, subset=subclass==s)
sub.effect[s] <- tmp$coef[2]
sub.var[s] <- summary(tmp)$coef[2,2]^2
sub.N[s] <- sum(df3$subclass==s)}
# Calculate overall ATE effect
ATE.effect <- sum((sub.N/N)*sub.effect)
ATE.stderror <- sqrt(sum((sub.N/N)^2*sub.var))
print(ATE.effect)
print(ATE.stderror)
#-----------------------------------
# regression based on trimmed ps
#------------------------------------
ps <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = df, family = binomial)
summary(ps)
df4 <- df # copy analysis file
# Attach the predicted propensity score to the datafile
df4$pscore <- predict(ps, type = "response")
# trim propensity scores
summary(df4$pscore)
df4%>%
summarize(mean = mean(pscore,na.rm = TRUE),
min = min(pscore,na.rm = TRUE),
P10 = quantile(pscore, probs=0.10,na.rm = TRUE),
Q1=quantile(pscore, probs=0.25,na.rm = TRUE),
med = median(pscore,na.rm = TRUE),
Q3=quantile(pscore, probs=0.75,na.rm = TRUE),
P85=quantile(pscore, probs=0.85,na.rm = TRUE),
P90=quantile(pscore, probs=0.90,na.rm = TRUE),
P95=quantile(pscore, probs=0.95,na.rm = TRUE),
P99=quantile(pscore, probs=0.99,na.rm = TRUE))
hist(df4$pscore)
dev.off()
# restrict data to ps range .10 <= ps <= .90
df5 <- df4[df4$pscore >= .03 & df4$pscore <=.72,]
summary(df5$pscore)
hist(df5$pscore)
# regression with controls on propensity score screened data set
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df5))
# quick balance checks
df5 %>% group_by(treat) %>% # group by treatment
summarize(AVGre74 = mean(re74),
AVGre75 = mean(re75),
AVGage = mean(age),
AVGedu = mean(educ),
AVGblack = mean(black),
AVGhisp = mean(hispan),
AVGmarr = mean(married),
AVGndeg = mean(nodegree)
)
# compare to unmathced data
df %>% group_by(treat) %>% # group by treatment
summarize(AVGre74 = mean(re74),
AVGre75 = mean(re75),
AVGage = mean(age),
AVGedu = mean(educ),
AVGblack = mean(black),
AVGhisp = mean(hispan),
AVGmarr = mean(married),
AVGndeg = mean(nodegree)
)
#-----------------------------------
# notes
#------------------------------------
# https://stats.stackexchange.com/questions/66132/understanding-output-of-matchit-in-r
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment