Skip to content

Instantly share code, notes, and snippets.

@anglilian
Last active March 30, 2020 13:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anglilian/ef2f4be8228d9f80d8cea01085a5fa94 to your computer and use it in GitHub Desktop.
Save anglilian/ef2f4be8228d9f80d8cea01085a5fa94 to your computer and use it in GitHub Desktop.
CS112 Causal Inference Assignment
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
set.seed(2324)
X = cbind(foo$Dems,foo$Repubs , foo$Christian , foo$age, foo$srvlng , foo$demvote, #added 6 more variables
foo$Catholic, foo$Protestant, foo$OthParty,foo$OtherC, foo$OtherR, foo$none)
Y= foo$nowtot
genout <- GenMatch(Tr = foo$treat, X = X, pop.size = 20, nboots=250)
mout1 <- Match(Tr= foo$treat, X= X, Weight.matrix = genout)
summary(mout1)
MatchBalance(treat ~ Dems +Repubs + Christian + age + srvlng + demvote +Catholic+ Protestant+ OthParty+OtherC+ OtherR+ none,
match.out = mout1, nboots = 250, data =foo)
#Match with Y
mout2 <- Match(Tr= foo$treat, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)
#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
##RUN A REGRESSION
set.seed(2324)
glm <- glm(nowtot ~hasgirls +Dems +Repubs + Christian + age + srvlng + demvote, data=foo)
X <- glm$fitted.values
Y <- foo$nowtot
Tr <- foo$hasgirls
rr <- Match(Y=Y, Tr=Tr, X=X, M=1); #to find treatment effect and confidence interval
summary(rr)
#Creating summary stats table
upperconfint <- as.numeric(rr[1])+(as.numeric(rr[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(rr[1])-(as.numeric(rr[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(rr[1], rr[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )
#Checking the balance
MatchBalance(hasgirls~ Dems +Repubs + Christian + age + srvlng + demvote,
data=foo, match.out = rr)
##GENETIC MATCHING
set.seed(2324)
X = cbind(foo$Dems,foo$Repubs , foo$Christian , foo$age , foo$srvlng , foo$demvote)
Y= foo$nowtot
genout <- GenMatch(Tr = foo$hasgirls, X = X, pop.size = 20, nboots=250)
mout1 <- Match(Tr= foo$hasgirls, X= X, Weight.matrix = genout)
summary(mout1)
MatchBalance(hasgirls ~ Dems +Repubs + Christian + age + srvlng + demvote,data=foo,
match.out = mout1, nboots = 250)
#MatchBalance with Y
mout2 <- Match(Tr= foo$hasgirls, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)
#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
set.seed(2324)
foo$treat <- (foo$ngirls>=2 & foo$nboys== 0 ) #changing treated to be having >2 female children and no boys
foo <- foo[-which(foo$treat ==0 & foo$nboys>=2 &foo$ngirls>0),] #excluding all control units with 2 or more boys, but have girls
foo <- foo[-which(foo$treat ==0 & foo$nboys<2),] #excluding all control units with less than 2 boys
X = cbind(foo$Dems,foo$Repubs , foo$Christian ,foo$age , foo$srvlng ,foo$demvote)
Y= foo$nowtot
genout <- GenMatch(Tr = foo$treat, X = X, M=2,pop.size = 20, nboots=250)
mout1 <- Match(Tr= foo$treat,M=2, X= X, Weight.matrix = genout)
summary(mout1)
MatchBalance(foo$treat ~ foo$Dems +foo$Repubs + foo$Christian + foo$age + foo$srvlng + foo$demvote,
match.out = mout1, nboots = 250)
#Match with Y
mout2 <- Match(Tr= foo$treat, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)
#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )
rm(list=ls())
foo <- read.csv("https://tinyurl.com/y2qv82ks")
# Only install the 2 packages below if not already installed
#install.packages("date")
#install.packages("Matching")
library(date)
library(Matching)
# dim(foo)
# 76772 14
names(foo)
#[1] "bank_code" [2] "bank_name" [3] "date_of_birth"
#[4] "gender" [5] "marital_status" [6] "education"
#[7] "occupation" [8] "postal_code" [9] "district_code"
#[10] "worker" [11] "capital" [12] "credit_proposal"
#[13] "status" [14] "randomid"
# vars 1, 2, and 3 have 17,330, 68,029, and 1108 blanks (""), respectively
# vars 8 and 11 have 13,118 and 14,642 NAs, respectively
# create dummies indicating where the blanks and NAs are.
missing_bank_code <- rep(0, 76772)
missing_bank_name <- rep(0, 76772)
missing_date_of_birth <- rep(0, 76772)
NA_postal_code <- rep(0, 76772)
NA_capital <- rep(0, 76772)
NA_credit_proposal <- rep(0, 76772)
foo <- cbind(foo, missing_bank_code,
missing_bank_name,
missing_date_of_birth,
NA_postal_code,
NA_capital,
NA_credit_proposal)
foo$missing_bank_code[which(foo$bank_code == "")] <- 1
foo$missing_bank_name[which(foo$bank_name == "")] <- 1
foo$missing_date_of_birth[which(foo$date_of_birth == "")] <- 1
foo$NA_capital[which(is.na(foo$capital) == TRUE)] <- 1
foo$NA_credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 1
foo$NA_postal_code[which(is.na(foo$postal_code) == TRUE)] <- 1
# change the dates to R-readable format
foo$R_date_of_birth <- as.character(foo[,3])
for(i in 1:length(foo[,3])) {foo$R_date_of_birth[i] <- as.date(foo$R_date_of_birth[i], order =
"dmy")}
foo$R_date_of_birth <- as.date(as.numeric(foo$R_date_of_birth))
oldest <- which(foo$R_date_of_birth < as.date("1-Jan-1910"))
youngest <- which(foo$R_date_of_birth > as.date("1 Jan 2001"))
foo$oldest <- rep(0, length(foo[,3]))
foo$youngest <- rep(0, length(foo[,3]))
foo$outlier_ages <- rep(0, length(foo[,3]))
foo$oldest[oldest] <- 1
foo$youngest[youngest] <- 1
foo$outlier_ages[c(oldest,youngest)] <- 1
foo$R_date_of_birth[which(is.na(foo$R_date_of_birth) == TRUE)] <- -9999999
# This obs with specific postal code makes no sense
foo <- foo[-which(foo$postal_code == 9151), ]
# To extract only the first digit of postal codes:
foo$postal_code1 <- foo$postal_code%/% 10000
foo$postal_code1[which(is.na(foo$postal_code1) == TRUE)] <- -9999999
# credit_proposal feature engineering
foo$credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 9999999
foo$credit_proposal_0 <- foo$credit_proposal == 0 & (is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_0to5 <- foo$credit_proposal > 0 & foo$credit_proposal < 5000000 &
(is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_5to10 <- foo$credit_proposal >= 5000000 & foo$credit_proposal < 10000000 &
(is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_10to20 <- foo$credit_proposal >= 10000000 & foo$credit_proposal < 20000000 &
(is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_20up <- foo$credit_proposal >= 20000000 & (is.na(foo$credit_proposal) ==
FALSE)
foo$credit_proposal_transformed <-
1*foo$credit_proposal_0 +
2*foo$credit_proposal_0to5 +
3*foo$credit_proposal_5to10 +
4*foo$credit_proposal_10to20 +
5*foo$credit_proposal_20up +
6*foo$NA_credit_proposal
# NA capital
foo$capital[which(is.na(foo$capital) == TRUE)] <- 9999999
# capital feature engineering
foo$capital_0 <- foo$capital == 0 & (is.na(foo$capital) == FALSE)
foo$capital_0to2 <- foo$capital > 0 & foo$capital < 200000 & (is.na(foo$capital) == FALSE)
foo$capital_2to5 <- foo$capital >= 200000 & foo$capital < 500000 & (is.na(foo$capital) == FALSE)
foo$capital_5to10 <- foo$capital >= 500000 & foo$capital < 1000000 & (is.na(foo$capital) ==
FALSE)
foo$capital_10to20 <- foo$capital >= 1000000 & foo$capital < 2000000 & (is.na(foo$capital) ==
FALSE)
foo$capital_20to50 <- foo$capital >= 2000000 & foo$capital < 5000000 & (is.na(foo$capital) ==
FALSE)
foo$capital_50up <- foo$capital >= 5000000 & (is.na(foo$capital) == FALSE)
foo$capital_transformed <-
1*foo$capital_0 +
2*foo$capital_0to2 +
3*foo$capital_2to5 +
4*foo$capital_5to10 +
5*foo$capital_10to20 +
6*foo$capital_20to50 +
7*foo$capital_50up +
8*foo$NA_capital
# worker feature engineering
# remove outlier in the control group (10 million workers)
foo <- foo[-which(foo$worker == max(foo$worker)),]
foo$worker_0 <- foo$worker == 0
foo$worker_1 <- foo$worker == 1
foo$worker_2 <- foo$worker == 2
foo$worker_3 <- foo$worker == 3
foo$worker_4 <- foo$worker == 4
foo$worker_5to9 <- foo$worker >=5 & foo$worker < 10
foo$worker_10to24 <- foo$worker >=10 & foo$worker < 25
foo$worker_25to99 <- foo$worker >=25 & foo$worker < 100
foo$worker_100up <- foo$worker >= 100
foo$worker_transformed <-
1*foo$worker_0 +
2*foo$worker_1 +
3*foo$worker_2 +
4*foo$worker_3 +
5*foo$worker_4 +
6*foo$worker_5to9 +
7*foo$worker_10to24 +
8*foo$worker_25to99 +
9*foo$worker_100up
# Treatment Indicator
foo$treat <- foo$status == "Sudah"
# The only bank code (var 1) is PEM
# The only bank name (var 2) is PEMDA -- regional government, not a bank
# (var 3) 174 outlier ages
# (var 4) gender is 3 levels: men, women, and business entity ("LAKI-LAKI" "PEREMPUAN" "BADAN USAHA")
# (var 5) marital_status: business agency, single, married ("BADAN USAHA" "BELUM KAWIN" "KAWIN")
### (var 6) EDUCATION
# "BADAN USAHA" "DIPLOMA" "LAINNYA" "SARJANA" "SD" "SMP" "SMU"
# "BUSINESS AGENCY" "DIPLOMA" "OTHER" "GRADUATE" "Elementary School" "Middle School" "High School"
### (var 7) OCCUPATION
#[1] "KARYAWAN SWASTA" "LAIN-LAIN/BADAN USAHA" "NELAYAN" "PEDAGANG"
#[5] "PENSIUNAN/PURNAWIRAWAN" "PETANI" "PNS" "PROFESIONAL"
#[9] "TNI/POLRI" "WIRASWASTA"
#[1] "PRIVATE EMPLOYEES" [2] "OTHERS / BUSINESS AGENCIES" [3] "FISHERMANS" [4] "TRADERS"
#[5] "PENSIONERS / PURNAWIRAWAN" [6] "FARMERS" [7] "CIVIL SERVANT" [8] "PROFESSIONAL"
#[9] "ARMY / POLICE" [10] "ENTREPRENEURS"
### (var 8) 938 unique POSTAL CODES
### (var 9) 63 unique DISTRICT CODES
### (var 10) : "worker". unknown meaning... 300+ unique numerical values (maybe no. of staff?)
### (var 11) (capital)
# 0% 25% 50% 75% 100%
# 0.00e+00 1.00e+06 1.00e+07 1.50e+07 1.75e+11
### (var 12) (credit proposal)
# 0% 25% 50% 75% 100%
# 0.0e+00 0.0e+00 1.0e+07 2.5e+07 5.0e+09
### (var 13) STATUS
# "Belum" "Sudah"
# "Not Yet" "Already"
### (var 14) randomid
foo_badan <- foo[which(foo$gender == "BADAN USAHA"), ]
foo_people <- foo[-which(foo$gender == "BADAN USAHA"), ]
######## CODE MODIFICATION SHOULD BEGIN HERE...
foo$district_code2 <- substr(foo$district_code,0,2) #taking the first two digits from the district code
foo$district_code2<-as.numeric(foo$district_code2)
X = cbind(foo$R_date_of_birth, foo$gender, foo$marital_status,
foo$education, foo$occupation, foo$district_code2, #swapped out postal code for district code
foo$worker, foo$capital, foo$credit_proposal,
foo$worker_transformed, foo$capital_transformed, foo$credit_proposal_transformed,
foo$missing_date_of_birth,
foo$NA_postal_code,
foo$NA_capital,
foo$NA_credit_proposal)
##match on date_of_birth within a 1-year (365-day) caliper
caliper <- 1/(sd(foo$R_date_of_birth)/365) #finding the standard deviation of a year (365 days)
Tr <- foo$treat
BalanceMat <- X
#
library(rgenoud)
genout <- GenMatch(Tr=Tr, X=X, BalanceMatrix=BalanceMat, estimand="ATT", M=1,
pop.size=10, max.generations=2, wait.generations=1,
#decreased pop.size, max.generations & wait.generations to reduce wait time to 1.5 hours
caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
NULL,NULL), #adding caliper only for the date of birth
exact = c(FALSE, TRUE, TRUE,
TRUE, TRUE, TRUE,
FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE,
TRUE,
TRUE,
TRUE,
TRUE))
#
mout <- Match(Tr=Tr, X=X, estimand="ATT", M=1,
caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
NULL,NULL), #adding caliper only for the date of birth
exact = c(FALSE, TRUE, TRUE,
TRUE, TRUE, TRUE,
FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE),
Weight.matrix = genout) #swapped the weight matrix with the new weights
summary(mout)
mb <- MatchBalance(foo$treat~
foo$R_date_of_birth + foo$gender + foo$marital_status +
foo$education + foo$occupation + foo$district_code2 + #swapped postal code for district code
foo$worker + foo$capital + foo$credit_proposal +
foo$worker_transformed + foo$capital_transformed +
foo$credit_proposal_transformed +
foo$missing_date_of_birth +
foo$NA_postal_code +
foo$NA_capital +
foo$NA_credit_proposal,
match.out=mout, nboots=500)
library(agridat)
library(Matching)
library(sensitivitymv)
library(rbounds)
foo <- diggle.cow
#adjusting for NA values
foo$weight[which(is.na(foo$weight)==TRUE)]<- -999
#treatment group interpreted as getting both of the randomised treatments
foo$treat <- foo$infect == 'Infected' & foo$iron=="Iron"
set.seed(2020)
glm <- glm(foo$weight ~ foo$treat) #regression with treatment as dependent variable
X <- glm$fitted
Y <- foo$weight
Tr<- foo$treat
genout <- GenMatch(X=X, Tr=Tr, M=1, estimand ="ATT",
pop.size = 100, max.generations = 10,
wait.generations = 1, ties =FALSE) #chose from ties based on order for faster runtime
mout<- Match(Y=Y, Tr=Tr, X=X, M=1, estimand = 'ATT',
Weight.matrix = genout, ties =FALSE) #chose from ties based on order for faster runtime
psens(mout, Gamma=1.5, GammaInc=.1) #sensitivity analysis
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment