Last active
January 18, 2019 01:41
-
-
Save BioSciEconomist/d97a720c8b58fded158b9cb51acb5a70 to your computer and use it in GitHub Desktop.
demo of R matchit based on: Matching as Nonparametric Preprocessing for Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart Political Analysis (2007) 15:199-236
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: MatchIt intro.R | |
# | DATE: 1/12/19 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: /Google Drive/R Training | |
# *---------------------------------------------------------------- | |
# | PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for | |
# | Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart | |
# | Political Analysis (2007) 15:199-236 | |
# *---------------------------------------------------------------- | |
library(dplyr) | |
library(lubridate) | |
rm(list=ls()) | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
library(MatchIt) | |
library(dplyr) | |
#----------------------------- | |
# data loading and customization | |
#----------------------------- | |
data("lalonde") # load data | |
head(lalonde) | |
df <- lalonde # resave data frame | |
# export to csv for other work | |
# write.csv(lalonde, file = "/Users/amandabogard/Google Drive/R Training/lalonde.csv", row.names = FALSE) | |
# create row ID in case we need this | |
df$RowID <- 1:nrow(df) | |
# convert index to column | |
mbrid <- rownames(df) | |
df <- cbind(mbrid,df) | |
df$mbrid <- as.character(df$mbrid) # character conversion | |
# create an index date for the treatment group for demo purposes | |
SVC_FROM_DT <- seq(as.Date("2000/1/1"), by="month", length=614) | |
df <-cbind(df,SVC_FROM_DT) | |
# set date to dummy date for controls | |
df$SVC_FROM_DT <- ifelse(df$treat==1,df$SVC_FROM_DT,as.Date("2020-01-01")) | |
df$SVC_FROM_DT <- as.Date(df$SVC_FROM_DT,origin = "1970-01-01") # fix formats | |
#----------------------------------------- | |
# explore data and check balance on covariates prior to matching | |
#---------------------------------------- | |
summary(df$treat) # treatment vs controls | |
table(df$treat) | |
# using dplyr with binary target | |
df %>% group_by(treat) %>% # group by treatment | |
summarize(AVGre74 = mean(re74), | |
SDre74 = sd(re74), | |
AVGre75 = mean(re75), | |
SDre75 = sd(re75), | |
AVGage = mean(age), | |
SDage = sd(age), | |
AVGedu = mean(educ), | |
SDedu = sd(educ) | |
) | |
# distributional summaries | |
by(df$re74, df$treat,summary) | |
by(df$re74, df$treat,sd) | |
by(df$re75, df$treat,summary) | |
by(df$re75, df$treat,sd) | |
# histograms | |
par(mfrow=c(1,2)) | |
hist(df$re74[df$treat==1]) | |
hist(df$re74[df$treat==0]) | |
dev.off() # reset par | |
## back to back | |
h1 = hist(df$re74[df$treat==1],plot=FALSE) | |
h2 = hist(df$re74[df$treat==0],plot=FALSE) | |
h2$counts = - h2$counts | |
hmax = max(h1$counts) | |
hmin = min(h2$counts) | |
X = c(h1$breaks, h2$breaks) | |
xmax = max(X) | |
xmin = min(X) | |
plot(h1, ylim=c(hmin, hmax), col="green", xlim=c(xmin, xmax)) | |
lines(h2, col="blue") | |
### overalapping densities by treatment and control groups | |
ds <- rbind(data.frame(dat=df[,][,"re75"], grp="All"), | |
data.frame(dat=df[,][df$treat==1,"re75"], grp="trt"), | |
data.frame(dat=df[,][df$treat==0,"re75"], grp="ctrl")) | |
# histogram for all data | |
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re75", col="grey90", breaks="fd", border=TRUE) | |
# compute density, rescale, and plot for all data | |
# dens <- density(ds[ds$grp=="All",1], na.rm=TRUE) | |
# rs <- max(hs$counts)/max(dens$y) | |
# lines(dens$x, dens$y*rs, type="l", col="black") | |
# compute density, rescale, and plot for treatment | |
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col="red") | |
# compute density, rescale, and plot for control | |
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col="green") | |
# Add a legend to the plot. | |
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green")) | |
dev.off() | |
# Grouped Bar Plot by target (iteration) | |
xvar <- df$nodegree # black hispan married nodegree | |
lbl <- "nodegree" | |
counts <- table(df$treat, xvar) | |
barplot(counts, main="Categorical Balance by Treatment", | |
xlab= lbl, col=c("darkblue","red"), | |
legend = rownames(counts), beside=TRUE) | |
# what is the raw difference in outcome re78 between treatment and controls | |
df%>% | |
group_by(treat) %>% # group by treatment | |
summarize(AVGre78 = mean(re78)) | |
# treatment group appears to have lower income | |
# is this difference significant | |
t.test(df$re78[df$treat==1],df$re78[df$treat==0],paired=FALSE) | |
summary(lm(re78 ~ treat, data = df)) | |
# difference with controls | |
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df)) | |
# controlled comparisons using regression indicate a postive treatment effect | |
# on the order of $1548 | |
#---------------------------------------------- | |
# match on propensity scores | |
#--------------------------------------------- | |
# reference model | |
summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df)) | |
# match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps) | |
set.seed(1234) | |
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1, | |
distance= "logit", caliper=0.10, replace= F) | |
# match data using nearest neighbor 2:1 greedy matching using logistic regression and caliper = .10 sd(ps) | |
set.seed(1234) | |
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=2, | |
distance= "logit", caliper=0.10, replace= F) | |
# match using 1:1 matching and hard matching on binary categoricals | |
set.seed(1234) | |
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df, method= "nearest", | |
ratio=1, distance= "logit", exact = c("black","hispan","married","nodegree"), caliper=0.10, replace= F) | |
summary(m.out) # check balance for each iteration above to determine which approach gives the most balance | |
# (notice the improvement of matches with caliper = .10 vs .25) | |
### Obtain matched analysis file | |
df1 <- match.data(m.out) | |
#--------------------------------------- | |
# examining matched observations | |
#--------------------------------------- | |
# although "matching does not require pairing observations"...."only the distributions | |
# need be matched as closely as possible" in some applications we want to pull pre and post period data for | |
# treatments and controls based on an index date representing the date of treatment for the treatment group. | |
# For control groups we typically 'assign' an index date based on the index date of the unit of observation | |
# they are matched to. In this case we need to identfy 'matched pairs' to get the index date | |
matchids <- data.frame(m.out$match.matrix) # extract matched pairs | |
# convert index to column to get treatment id and rename 1st column as the control id | |
trtid <- rownames(matchids) | |
matchids <- cbind(trtid,matchids)%>%rename(ctrlid = X1) | |
# format conversions | |
matchids$trtid <- as.character(matchids$trtid) | |
matchids$ctrlid <- as.character(matchids$ctrlid) | |
# get only treatment data from matched output data to form a treatment cohort file | |
trtchrt <- df1%>%filter(treat ==1) | |
trtchrt$trtid <- trtchrt$mbrid | |
# from matchids add the corresponding control id to the treatment cohort file | |
trtchrt <- trtchrt%>%left_join(select(matchids,ctrlid,trtid), by = "trtid") | |
# get only control data from matched output data to form a contrl cohort file with their ps and id | |
ctrlchrt <- df1%>%filter(treat ==0)%>%rename(psctrl = distance) # filter controls and rename ps variable | |
ctrlchrt$ctrlid <- ctrlchrt$mbrid # create ctrl id | |
# add the matched control's ps to the treatment cohort file joining on the ctrlid | |
trtchrt <- trtchrt%>%left_join(select(ctrlchrt,ctrlid,psctrl), by = "ctrlid") | |
# extract control ID and relevant date that we will use for the index date | |
# this date corresponds to the matched treatment member's program start date and | |
# will be used as the index date for the control member | |
matchindexdt <- trtchrt[c('SVC_FROM_DT','ctrlid')]%>%rename(ctrlindex = SVC_FROM_DT,mbrid =ctrlid) | |
# merge with matched analysis file | |
df1 <- df1%>%left_join(select(matchindexdt,mbrid,ctrlindex), by = "mbrid") | |
# create index date for matched analysis file | |
df1$index_dt <- ifelse(df1$treat,df1$SVC_FROM_DT,df1$ctrlindex) | |
df1$index_dt <- as.Date(df1$index_dt,origin = "1970-01-01") | |
# check a few examples to makes sure matched ids and dates line up across treatments and controls | |
df1[df1$mbrid %in% c('PSID337', 'PSID240', 'PSID256'),]%>%select(mbrid,index_dt) | |
matchids[matchids$ctrlid %in%c('PSID337', 'PSID240', 'PSID256'),] | |
df[df$mbrid %in% c('NSW1','NSW2','NSW3'),] | |
#---------------------------------------- | |
# explore and check balance after matching | |
#---------------------------------------- | |
# check balance | |
summary(m.out) | |
# check both means and sds | |
df1 %>% group_by(treat) %>% # group by treatment | |
summarize(AVGre74 = mean(re74), | |
SDre74 = sd(re74), | |
AVGre75 = mean(re75), | |
SDre75 = sd(re75), | |
AVGage = mean(age), | |
SDage = sd(age), | |
AVGedu = mean(educ), | |
SDedu = sd(educ) | |
) | |
# visualize balance diagnostics | |
plot(m.out) # default QQ plots | |
plot(m.out, type = "hist") # common support of matched groups | |
plot(m.out, type = "jitter") # scatter plots of propensity scores common support | |
# histograms | |
par(mfrow=c(1,2)) | |
hist(df1$re74[df1$treat==1]) | |
hist(df1$re74[df1$treat==0]) | |
dev.off() # reset par | |
### overalapping densities by treatment and control groups | |
ds <- rbind(data.frame(dat=df1[,][,"re74"], grp="All"), | |
data.frame(dat=df1[,][df1$treat==1,"re75"], grp="trt"), | |
data.frame(dat=df1[,][df1$treat==0,"re75"], grp="ctrl")) | |
# histogram for all data | |
hs <- hist(ds[ds$grp=="All",1], main="", xlab="re74", col="grey90", breaks="fd", border=TRUE) | |
# compute density, rescale, and plot for treatment | |
dens <- density(ds[ds$grp=="trt",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col="red") | |
# compute density, rescale, and plot for control | |
dens <- density(ds[ds$grp=="ctrl",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col="green") | |
# Add a legend to the plot. | |
legend("topright", c("All", "trt", "ctrl"), bty="n", fill=c("grey","red","green")) | |
dev.off() | |
#----------------------------------------- | |
# estimating treatment effects | |
#---------------------------------------- | |
# what is the difference between outcomes for matched data | |
df1%>% | |
group_by(treat) %>% # group by treatment | |
summarize(AVGre78 = mean(re78)) | |
# treatment now has much higher value of outcome | |
# t-test | |
t.test(df1$re78[df1$treat==1],df1$re78[df1$treat==0],paired=FALSE) | |
# regression | |
summary(lm(re78 ~ treat, data = df1)) | |
# regression with added controls | |
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df1)) | |
# regression adjustment does not remove much additional bias or improve significance | |
#--------------------------------------- | |
# sensitivity analysis | |
#-------------------------------------- | |
library(rbounds) | |
# the value of Gamma is interpreted as the odds of treatment assignment hidden bias. | |
# A change in the odds lower/upper bounds from significant to non-significant (or vice-versa) | |
# indicates by how much the odds need to change before the statistical significance of the outcome | |
# shifts. For example, in Figure 17, the lower bound estimate changes from non-significant (0.0541) | |
# to significant (0.0194) when gamma is 1.3. That is, a change of 0.3 in the odds will produce a | |
# change in the significance value. Rosenbaum (2002) defines a study as sensitive if values of | |
# Gamma close to 1 lead to changes in significance compared to those that could be obtained if the | |
# study is free of bias. Thus results will be more robust to hidden bias, if a very large change | |
# in the odds is needed before a change in statistical significance happens. | |
# Sensitivity Test Example https://www.rdocumentation.org/packages/rbounds/versions/2.1/topics/psens | |
trt <- c(38, 23, 41, 18, 37, 36, 23, 62, 31, 34, 24, 14, 21, 17, 16, 20, | |
15, 10, 45, 39, 22, 35, 49, 48, 44, 35, 43, 39, 34, 13, 73, 25, | |
27) | |
ctrl <- c(16, 18, 18, 24, 19, 11, 10, 15, 16, 18, 18, 13, 19, 10, 16, | |
16, 24, 13, 9, 14, 21, 19, 7, 18, 19, 12, 11, 22, 25, 16, 13, | |
11, 13) | |
psens(trt, ctrl) | |
# create trt and control groups sorted by propensity | |
trt <- df1%>% | |
filter(treat==1)%>% | |
arrange(distance) | |
ctrl <- df1%>% | |
filter(treat==0)%>% | |
arrange(distance) | |
trt <- as.vector(trt$re78) | |
ctrl <- as.vector(ctrl$re78) | |
psens(trt, ctrl,Gamma = 2, GammaInc = 0.1) | |
#----------------------------------- | |
# subclassification | |
#----------------------------------- | |
m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "subclass", | |
distance= "logit", subclass = 5) | |
summary(m.out) | |
# estimation of treatment effects | |
df3 <- match.data(m.out) | |
# Subclass-specific t-test for given subclass | |
t.test(re78~treat, data=df3, subset=subclass==1) | |
# Equivalent to subclass-specific t-test for all subclasses | |
summary(lm(re78 ~ subclass + treat + subclass*treat, data = df3)) | |
# ATE estimate (combining subclass-specific estimate) | |
# define N = total number of people | |
N <- dim(df3)[1] | |
# Initialize vectors for subclass-specific effects (“sub.effect”), variances(“sub.var”), | |
# and sample size(“sub.N”) | |
sub.effect <- rep(NA, max(df3$subclass)) | |
sub.var <- rep(NA, max(df3$subclass)) | |
sub.N <- rep(NA, max(df3$subclass)) | |
# Run linear regression model within each subclass | |
for(s in 1:max(df3$subclass)){ | |
tmp <- lm(re78 ~ treat, data=df3, subset=subclass==s) | |
sub.effect[s] <- tmp$coef[2] | |
sub.var[s] <- summary(tmp)$coef[2,2]^2 | |
sub.N[s] <- sum(df3$subclass==s)} | |
# Calculate overall ATE effect | |
ATE.effect <- sum((sub.N/N)*sub.effect) | |
ATE.stderror <- sqrt(sum((sub.N/N)^2*sub.var)) | |
print(ATE.effect) | |
print(ATE.stderror) | |
#----------------------------------- | |
# regression based on trimmed ps | |
#------------------------------------ | |
ps <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = df, family = binomial) | |
summary(ps) | |
df4 <- df # copy analysis file | |
# Attach the predicted propensity score to the datafile | |
df4$pscore <- predict(ps, type = "response") | |
# trim propensity scores | |
summary(df4$pscore) | |
df4%>% | |
summarize(mean = mean(pscore,na.rm = TRUE), | |
min = min(pscore,na.rm = TRUE), | |
P10 = quantile(pscore, probs=0.10,na.rm = TRUE), | |
Q1=quantile(pscore, probs=0.25,na.rm = TRUE), | |
med = median(pscore,na.rm = TRUE), | |
Q3=quantile(pscore, probs=0.75,na.rm = TRUE), | |
P85=quantile(pscore, probs=0.85,na.rm = TRUE), | |
P90=quantile(pscore, probs=0.90,na.rm = TRUE), | |
P95=quantile(pscore, probs=0.95,na.rm = TRUE), | |
P99=quantile(pscore, probs=0.99,na.rm = TRUE)) | |
hist(df4$pscore) | |
dev.off() | |
# restrict data to ps range .10 <= ps <= .90 | |
df5 <- df4[df4$pscore >= .03 & df4$pscore <=.72,] | |
summary(df5$pscore) | |
hist(df5$pscore) | |
# regression with controls on propensity score screened data set | |
summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df5)) | |
# quick balance checks | |
df5 %>% group_by(treat) %>% # group by treatment | |
summarize(AVGre74 = mean(re74), | |
AVGre75 = mean(re75), | |
AVGage = mean(age), | |
AVGedu = mean(educ), | |
AVGblack = mean(black), | |
AVGhisp = mean(hispan), | |
AVGmarr = mean(married), | |
AVGndeg = mean(nodegree) | |
) | |
# compare to unmathced data | |
df %>% group_by(treat) %>% # group by treatment | |
summarize(AVGre74 = mean(re74), | |
AVGre75 = mean(re75), | |
AVGage = mean(age), | |
AVGedu = mean(educ), | |
AVGblack = mean(black), | |
AVGhisp = mean(hispan), | |
AVGmarr = mean(married), | |
AVGndeg = mean(nodegree) | |
) | |
#----------------------------------- | |
# notes | |
#------------------------------------ | |
# https://stats.stackexchange.com/questions/66132/understanding-output-of-matchit-in-r | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment