anglilian/Q1_Bonus.r

## Q1_Bonus.r
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
set.seed(2324)

X = cbind(foo$Dems,foo$Repubs , foo$Christian , foo$age, foo$srvlng , foo$demvote,  #added 6 more variables
          foo$Catholic, foo$Protestant, foo$OthParty,foo$OtherC, foo$OtherR, foo$none)
Y= foo$nowtot

genout <- GenMatch(Tr = foo$treat, X = X, pop.size = 20, nboots=250)

mout1 <- Match(Tr= foo$treat, X= X, Weight.matrix = genout)
summary(mout1)

MatchBalance(treat ~ Dems +Repubs + Christian + age  + srvlng + demvote +Catholic+ Protestant+ OthParty+OtherC+ OtherR+ none,
             match.out = mout1, nboots = 250, data =foo)

#Match with Y
mout2 <- Match(Tr= foo$treat, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)

#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )

## Q1_PartA.r
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))

##RUN A REGRESSION
set.seed(2324)

glm <- glm(nowtot ~hasgirls +Dems +Repubs + Christian + age  + srvlng + demvote, data=foo)

X <- glm$fitted.values

Y <- foo$nowtot

Tr <- foo$hasgirls

rr <- Match(Y=Y, Tr=Tr, X=X, M=1); #to find treatment effect and confidence interval
summary(rr)

#Creating summary stats table
upperconfint <- as.numeric(rr[1])+(as.numeric(rr[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(rr[1])-(as.numeric(rr[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(rr[1], rr[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )

#Checking the balance
MatchBalance(hasgirls~ Dems +Repubs + Christian + age  + srvlng + demvote,
             data=foo, match.out = rr)

##GENETIC MATCHING
set.seed(2324)

X = cbind(foo$Dems,foo$Repubs , foo$Christian , foo$age  , foo$srvlng , foo$demvote)
Y= foo$nowtot

genout <- GenMatch(Tr = foo$hasgirls, X = X, pop.size = 20, nboots=250)

mout1 <- Match(Tr= foo$hasgirls, X= X, Weight.matrix = genout)
summary(mout1)

MatchBalance(hasgirls ~ Dems +Repubs + Christian + age  + srvlng + demvote,data=foo,
             match.out = mout1, nboots = 250)

#MatchBalance with Y
mout2 <- Match(Tr= foo$hasgirls, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)

#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )

## Q1_PartB.r
library(Matching)
library(plyr)
foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
set.seed(2324)

foo$treat <- (foo$ngirls>=2 & foo$nboys== 0 ) #changing treated to be having >2 female children and no boys
foo <- foo[-which(foo$treat ==0 & foo$nboys>=2 &foo$ngirls>0),] #excluding all control units with 2 or more boys, but have girls
foo <- foo[-which(foo$treat ==0 & foo$nboys<2),] #excluding all control units with less than 2 boys
X = cbind(foo$Dems,foo$Repubs , foo$Christian ,foo$age  , foo$srvlng ,foo$demvote)

Y= foo$nowtot

genout <- GenMatch(Tr = foo$treat, X = X, M=2,pop.size = 20, nboots=250)

mout1 <- Match(Tr= foo$treat,M=2, X= X, Weight.matrix = genout)
summary(mout1)

MatchBalance(foo$treat ~ foo$Dems +foo$Repubs + foo$Christian + foo$age  + foo$srvlng + foo$demvote,
             match.out = mout1, nboots = 250)

#Match with Y
mout2 <- Match(Tr= foo$treat, X= X, Y=Y, Weight.matrix = genout)
summary(mout2)

#Creating summary stats table
upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])*1.96) #treatment effect +SE * 1.96
lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])*1.96)#treatment effect -SE * 1.96
summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )

## Q2.r
rm(list=ls())

foo <- read.csv("https://tinyurl.com/y2qv82ks")

# Only install the 2 packages below if not already installed
#install.packages("date")
#install.packages("Matching")

library(date)
library(Matching)

# dim(foo)
# 76772    14

names(foo)
#[1] "bank_code"      [2] "bank_name"      [3] "date_of_birth"
#[4] "gender"         [5] "marital_status" [6]  "education"
#[7] "occupation"     [8] "postal_code"    [9]  "district_code"
#[10] "worker"        [11] "capital"       [12] "credit_proposal"
#[13] "status"        [14] "randomid"

# vars 1, 2, and 3 have 17,330, 68,029, and 1108 blanks (""), respectively
# vars 8 and 11 have 13,118 and 14,642 NAs, respectively

# create dummies indicating where the blanks and NAs are.
missing_bank_code <- rep(0, 76772)
missing_bank_name <- rep(0, 76772)
missing_date_of_birth <- rep(0, 76772)
NA_postal_code <- rep(0, 76772)
NA_capital <- rep(0, 76772)
NA_credit_proposal <- rep(0, 76772)

foo <- cbind(foo, missing_bank_code,
             missing_bank_name,
             missing_date_of_birth,
             NA_postal_code,
             NA_capital,
             NA_credit_proposal)

foo$missing_bank_code[which(foo$bank_code == "")] <- 1
foo$missing_bank_name[which(foo$bank_name == "")] <- 1
foo$missing_date_of_birth[which(foo$date_of_birth == "")] <- 1
foo$NA_capital[which(is.na(foo$capital) == TRUE)] <- 1
foo$NA_credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 1
foo$NA_postal_code[which(is.na(foo$postal_code) == TRUE)] <- 1

# change the dates to R-readable format
foo$R_date_of_birth <- as.character(foo[,3])
for(i in 1:length(foo[,3])) {foo$R_date_of_birth[i] <- as.date(foo$R_date_of_birth[i], order =
                                                                 "dmy")}
foo$R_date_of_birth <- as.date(as.numeric(foo$R_date_of_birth))

oldest <- which(foo$R_date_of_birth < as.date("1-Jan-1910"))
youngest <- which(foo$R_date_of_birth > as.date("1 Jan 2001"))

foo$oldest <- rep(0, length(foo[,3]))
foo$youngest <- rep(0, length(foo[,3]))
foo$outlier_ages <- rep(0, length(foo[,3]))
foo$oldest[oldest] <- 1
foo$youngest[youngest] <- 1
foo$outlier_ages[c(oldest,youngest)] <- 1

foo$R_date_of_birth[which(is.na(foo$R_date_of_birth) == TRUE)] <- -9999999

# This obs with specific postal code makes no sense
foo <- foo[-which(foo$postal_code == 9151), ]

# To extract only the first digit of postal codes:
foo$postal_code1 <- foo$postal_code%/% 10000
foo$postal_code1[which(is.na(foo$postal_code1) == TRUE)] <- -9999999

# credit_proposal feature engineering
foo$credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 9999999

foo$credit_proposal_0 <- foo$credit_proposal == 0 & (is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_0to5 <- foo$credit_proposal > 0 & foo$credit_proposal < 5000000 &
  (is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_5to10 <- foo$credit_proposal >= 5000000 & foo$credit_proposal < 10000000 &
  (is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_10to20 <- foo$credit_proposal >= 10000000 & foo$credit_proposal < 20000000 &
  (is.na(foo$credit_proposal) == FALSE)
foo$credit_proposal_20up <- foo$credit_proposal >= 20000000 & (is.na(foo$credit_proposal) ==
                                                                 FALSE)

foo$credit_proposal_transformed <-
  1*foo$credit_proposal_0 +
  2*foo$credit_proposal_0to5 +
  3*foo$credit_proposal_5to10 +
  4*foo$credit_proposal_10to20 +
  5*foo$credit_proposal_20up +
  6*foo$NA_credit_proposal

# NA capital
foo$capital[which(is.na(foo$capital) == TRUE)] <- 9999999

# capital feature engineering
foo$capital_0 <- foo$capital == 0 & (is.na(foo$capital) == FALSE)
foo$capital_0to2 <- foo$capital > 0 & foo$capital < 200000 & (is.na(foo$capital) == FALSE)
foo$capital_2to5 <- foo$capital >= 200000 & foo$capital < 500000 & (is.na(foo$capital) == FALSE)
foo$capital_5to10 <- foo$capital >= 500000 & foo$capital < 1000000 & (is.na(foo$capital) ==
                                                                        FALSE)
foo$capital_10to20 <- foo$capital >= 1000000 & foo$capital < 2000000 & (is.na(foo$capital) ==
                                                                          FALSE)
foo$capital_20to50 <- foo$capital >= 2000000 & foo$capital < 5000000 & (is.na(foo$capital) ==
                                                                          FALSE)
foo$capital_50up <- foo$capital >= 5000000 & (is.na(foo$capital) == FALSE)
foo$capital_transformed <-
  1*foo$capital_0 +
  2*foo$capital_0to2 +
  3*foo$capital_2to5 +
  4*foo$capital_5to10 +
  5*foo$capital_10to20 +
  6*foo$capital_20to50 +
  7*foo$capital_50up +
  8*foo$NA_capital

# worker feature engineering
# remove outlier in the control group (10 million workers)
foo <- foo[-which(foo$worker == max(foo$worker)),]

foo$worker_0 <- foo$worker == 0
foo$worker_1 <- foo$worker == 1
foo$worker_2 <- foo$worker == 2
foo$worker_3 <- foo$worker == 3
foo$worker_4 <- foo$worker == 4
foo$worker_5to9 <- foo$worker >=5 & foo$worker < 10
foo$worker_10to24 <- foo$worker >=10 & foo$worker < 25
foo$worker_25to99 <- foo$worker >=25 & foo$worker < 100
foo$worker_100up <- foo$worker >= 100

foo$worker_transformed <-
  1*foo$worker_0 +
  2*foo$worker_1 +
  3*foo$worker_2 +
  4*foo$worker_3 +
  5*foo$worker_4 +
  6*foo$worker_5to9 +
  7*foo$worker_10to24 +
  8*foo$worker_25to99 +
  9*foo$worker_100up


# Treatment Indicator
foo$treat <- foo$status == "Sudah"

# The only bank code (var 1) is PEM
# The only bank name (var 2) is PEMDA -- regional government, not a bank
# (var 3) 174 outlier ages
# (var 4) gender is 3 levels: men, women, and business entity ("LAKI-LAKI"   "PEREMPUAN"   "BADAN USAHA")
# (var 5) marital_status: business agency, single, married ("BADAN USAHA" "BELUM KAWIN" "KAWIN")

### (var 6) EDUCATION
# "BADAN USAHA" "DIPLOMA"     "LAINNYA"     "SARJANA"     "SD"          "SMP"         "SMU"
# "BUSINESS AGENCY" "DIPLOMA" "OTHER" "GRADUATE" "Elementary School" "Middle School" "High School"

### (var 7) OCCUPATION
#[1] "KARYAWAN SWASTA"        "LAIN-LAIN/BADAN USAHA"  "NELAYAN"                "PEDAGANG"
#[5] "PENSIUNAN/PURNAWIRAWAN" "PETANI"                 "PNS"                    "PROFESIONAL"
#[9] "TNI/POLRI"              "WIRASWASTA"

#[1] "PRIVATE EMPLOYEES" [2] "OTHERS / BUSINESS AGENCIES" [3] "FISHERMANS" [4] "TRADERS"
#[5] "PENSIONERS / PURNAWIRAWAN" [6] "FARMERS" [7] "CIVIL SERVANT" [8] "PROFESSIONAL"
#[9] "ARMY / POLICE" [10] "ENTREPRENEURS"

### (var 8) 938 unique POSTAL CODES

### (var 9) 63 unique DISTRICT CODES

### (var 10) : "worker". unknown meaning... 300+ unique numerical values (maybe no. of staff?)

### (var 11) (capital)
# 0%      25%      50%      75%     100%
# 0.00e+00 1.00e+06 1.00e+07 1.50e+07 1.75e+11

### (var 12) (credit proposal)
# 0%     25%     50%     75%    100%
# 0.0e+00 0.0e+00 1.0e+07 2.5e+07 5.0e+09

### (var 13) STATUS
# "Belum"    "Sudah"
# "Not Yet"  "Already"

### (var 14) randomid

foo_badan <- foo[which(foo$gender == "BADAN USAHA"), ]
foo_people <- foo[-which(foo$gender == "BADAN USAHA"), ]

######## CODE MODIFICATION SHOULD BEGIN HERE...
foo$district_code2 <- substr(foo$district_code,0,2) #taking the first two digits from the district code
foo$district_code2<-as.numeric(foo$district_code2)

X = cbind(foo$R_date_of_birth, foo$gender, foo$marital_status,
                foo$education, foo$occupation, foo$district_code2, #swapped out postal code for district code
                foo$worker, foo$capital, foo$credit_proposal,
                foo$worker_transformed, foo$capital_transformed, foo$credit_proposal_transformed,
                foo$missing_date_of_birth,
                foo$NA_postal_code,
                foo$NA_capital,
                foo$NA_credit_proposal)

##match on date_of_birth within a 1-year (365-day) caliper
caliper <- 1/(sd(foo$R_date_of_birth)/365) #finding the standard deviation of a year (365 days)

Tr <- foo$treat

BalanceMat <- X
#
library(rgenoud)
genout <- GenMatch(Tr=Tr, X=X, BalanceMatrix=BalanceMat, estimand="ATT", M=1,
                          pop.size=10, max.generations=2, wait.generations=1,
                   #decreased pop.size, max.generations & wait.generations to reduce wait time to 1.5 hours
                   caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
                               NULL,NULL),  #adding caliper only for the date of birth
                          exact = c(FALSE, TRUE, TRUE,
                                    TRUE, TRUE, TRUE,
                                    FALSE, FALSE, FALSE,
                                    TRUE, TRUE, TRUE,
                                    TRUE,
                                    TRUE,
                                    TRUE,
                                    TRUE))
#


mout <- Match(Tr=Tr, X=X, estimand="ATT", M=1,
             caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
                          NULL,NULL),  #adding caliper only for the date of birth
                     exact = c(FALSE, TRUE, TRUE,
                              TRUE, TRUE, TRUE,
                              FALSE, FALSE, FALSE,
                              TRUE, TRUE, TRUE,
                              TRUE, TRUE, TRUE, TRUE),
              Weight.matrix = genout) #swapped the weight matrix with the new weights
  summary(mout)


  mb <- MatchBalance(foo$treat~
                            	foo$R_date_of_birth + foo$gender + foo$marital_status +
                            	foo$education + foo$occupation + foo$district_code2 + #swapped postal code for district code
                            	foo$worker + foo$capital + foo$credit_proposal +
                            	foo$worker_transformed + foo$capital_transformed +
foo$credit_proposal_transformed +
                            	foo$missing_date_of_birth +
                            	foo$NA_postal_code +
                            	foo$NA_capital +
                            	foo$NA_credit_proposal,
                         	match.out=mout, nboots=500)

## Q3.r
library(agridat)
library(Matching)
library(sensitivitymv)
library(rbounds)
foo <- diggle.cow

#adjusting for NA values
foo$weight[which(is.na(foo$weight)==TRUE)]<- -999

#treatment group interpreted as getting both of the randomised treatments
foo$treat <- foo$infect == 'Infected' & foo$iron=="Iron"

set.seed(2020)
glm <- glm(foo$weight ~ foo$treat) #regression with treatment as dependent variable

X <- glm$fitted
Y <- foo$weight
Tr<- foo$treat

genout <- GenMatch(X=X, Tr=Tr, M=1, estimand ="ATT",
                   pop.size = 100, max.generations = 10,
                   wait.generations = 1, ties =FALSE) #chose from ties based on order for faster runtime

mout<- Match(Y=Y, Tr=Tr, X=X, M=1, estimand = 'ATT',
             Weight.matrix = genout, ties =FALSE) #chose from ties based on order for faster runtime

psens(mout, Gamma=1.5, GammaInc=.1) #sensitivity analysis
	library(Matching)
	library(plyr)
	foo <- read.csv(url("https://course-resources.minerva.kgi.edu/uploaded_files/mke/00089202-1711/daughters.csv"))
	set.seed(2324)

	X = cbind(foo$Dems,foo$Repubs , foo$Christian , foo$age, foo$srvlng , foo$demvote, #added 6 more variables
	foo$Catholic, foo$Protestant, foo$OthParty,foo$OtherC, foo$OtherR, foo$none)
	Y= foo$nowtot

	genout <- GenMatch(Tr = foo$treat, X = X, pop.size = 20, nboots=250)

	mout1 <- Match(Tr= foo$treat, X= X, Weight.matrix = genout)
	summary(mout1)

	MatchBalance(treat ~ Dems +Repubs + Christian + age + srvlng + demvote +Catholic+ Protestant+ OthParty+OtherC+ OtherR+ none,
	match.out = mout1, nboots = 250, data =foo)

	#Match with Y
	mout2 <- Match(Tr= foo$treat, X= X, Y=Y, Weight.matrix = genout)
	summary(mout2)

	#Creating summary stats table
	upperconfint <- as.numeric(mout2[1])+(as.numeric(mout2[2])1.96) #treatment effect +SE 1.96
	lowerconfint <- as.numeric(mout2[1])-(as.numeric(mout2[2])1.96)#treatment effect -SE 1.96
	summarystat <- data.frame(mout2[1], mout2[2], lowerconfint, upperconfint)
	colnames(summarystat)<- c("Estimated Treatment Effect", "Standard Error", "Lower Confidence Interval", "Upper Confidence Interval" )
	rm(list=ls())

	foo <- read.csv("https://tinyurl.com/y2qv82ks")

	# Only install the 2 packages below if not already installed
	#install.packages("date")
	#install.packages("Matching")

	library(date)
	library(Matching)

	# dim(foo)
	# 76772 14

	names(foo)
	#[1] "bank_code" [2] "bank_name" [3] "date_of_birth"
	#[4] "gender" [5] "marital_status" [6] "education"
	#[7] "occupation" [8] "postal_code" [9] "district_code"
	#[10] "worker" [11] "capital" [12] "credit_proposal"
	#[13] "status" [14] "randomid"

	# vars 1, 2, and 3 have 17,330, 68,029, and 1108 blanks (""), respectively
	# vars 8 and 11 have 13,118 and 14,642 NAs, respectively

	# create dummies indicating where the blanks and NAs are.
	missing_bank_code <- rep(0, 76772)
	missing_bank_name <- rep(0, 76772)
	missing_date_of_birth <- rep(0, 76772)
	NA_postal_code <- rep(0, 76772)
	NA_capital <- rep(0, 76772)
	NA_credit_proposal <- rep(0, 76772)

	foo <- cbind(foo, missing_bank_code,
	missing_bank_name,
	missing_date_of_birth,
	NA_postal_code,
	NA_capital,
	NA_credit_proposal)

	foo$missing_bank_code[which(foo$bank_code == "")] <- 1
	foo$missing_bank_name[which(foo$bank_name == "")] <- 1
	foo$missing_date_of_birth[which(foo$date_of_birth == "")] <- 1
	foo$NA_capital[which(is.na(foo$capital) == TRUE)] <- 1
	foo$NA_credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 1
	foo$NA_postal_code[which(is.na(foo$postal_code) == TRUE)] <- 1

	# change the dates to R-readable format
	foo$R_date_of_birth <- as.character(foo[,3])
	for(i in 1:length(foo[,3])) {foo$R_date_of_birth[i] <- as.date(foo$R_date_of_birth[i], order =
	"dmy")}
	foo$R_date_of_birth <- as.date(as.numeric(foo$R_date_of_birth))

	oldest <- which(foo$R_date_of_birth < as.date("1-Jan-1910"))
	youngest <- which(foo$R_date_of_birth > as.date("1 Jan 2001"))

	foo$oldest <- rep(0, length(foo[,3]))
	foo$youngest <- rep(0, length(foo[,3]))
	foo$outlier_ages <- rep(0, length(foo[,3]))
	foo$oldest[oldest] <- 1
	foo$youngest[youngest] <- 1
	foo$outlier_ages[c(oldest,youngest)] <- 1

	foo$R_date_of_birth[which(is.na(foo$R_date_of_birth) == TRUE)] <- -9999999

	# This obs with specific postal code makes no sense
	foo <- foo[-which(foo$postal_code == 9151), ]

	# To extract only the first digit of postal codes:
	foo$postal_code1 <- foo$postal_code%/% 10000
	foo$postal_code1[which(is.na(foo$postal_code1) == TRUE)] <- -9999999

	# credit_proposal feature engineering
	foo$credit_proposal[which(is.na(foo$credit_proposal) == TRUE)] <- 9999999

	foo$credit_proposal_0 <- foo$credit_proposal == 0 & (is.na(foo$credit_proposal) == FALSE)
	foo$credit_proposal_0to5 <- foo$credit_proposal > 0 & foo$credit_proposal < 5000000 &
	(is.na(foo$credit_proposal) == FALSE)
	foo$credit_proposal_5to10 <- foo$credit_proposal >= 5000000 & foo$credit_proposal < 10000000 &
	(is.na(foo$credit_proposal) == FALSE)
	foo$credit_proposal_10to20 <- foo$credit_proposal >= 10000000 & foo$credit_proposal < 20000000 &
	(is.na(foo$credit_proposal) == FALSE)
	foo$credit_proposal_20up <- foo$credit_proposal >= 20000000 & (is.na(foo$credit_proposal) ==
	FALSE)

	foo$credit_proposal_transformed <-
	1*foo$credit_proposal_0 +
	2*foo$credit_proposal_0to5 +
	3*foo$credit_proposal_5to10 +
	4*foo$credit_proposal_10to20 +
	5*foo$credit_proposal_20up +
	6*foo$NA_credit_proposal

	# NA capital
	foo$capital[which(is.na(foo$capital) == TRUE)] <- 9999999

	# capital feature engineering
	foo$capital_0 <- foo$capital == 0 & (is.na(foo$capital) == FALSE)
	foo$capital_0to2 <- foo$capital > 0 & foo$capital < 200000 & (is.na(foo$capital) == FALSE)
	foo$capital_2to5 <- foo$capital >= 200000 & foo$capital < 500000 & (is.na(foo$capital) == FALSE)
	foo$capital_5to10 <- foo$capital >= 500000 & foo$capital < 1000000 & (is.na(foo$capital) ==
	FALSE)
	foo$capital_10to20 <- foo$capital >= 1000000 & foo$capital < 2000000 & (is.na(foo$capital) ==
	FALSE)
	foo$capital_20to50 <- foo$capital >= 2000000 & foo$capital < 5000000 & (is.na(foo$capital) ==
	FALSE)
	foo$capital_50up <- foo$capital >= 5000000 & (is.na(foo$capital) == FALSE)
	foo$capital_transformed <-
	1*foo$capital_0 +
	2*foo$capital_0to2 +
	3*foo$capital_2to5 +
	4*foo$capital_5to10 +
	5*foo$capital_10to20 +
	6*foo$capital_20to50 +
	7*foo$capital_50up +
	8*foo$NA_capital

	# worker feature engineering
	# remove outlier in the control group (10 million workers)
	foo <- foo[-which(foo$worker == max(foo$worker)),]

	foo$worker_0 <- foo$worker == 0
	foo$worker_1 <- foo$worker == 1
	foo$worker_2 <- foo$worker == 2
	foo$worker_3 <- foo$worker == 3
	foo$worker_4 <- foo$worker == 4
	foo$worker_5to9 <- foo$worker >=5 & foo$worker < 10
	foo$worker_10to24 <- foo$worker >=10 & foo$worker < 25
	foo$worker_25to99 <- foo$worker >=25 & foo$worker < 100
	foo$worker_100up <- foo$worker >= 100

	foo$worker_transformed <-
	1*foo$worker_0 +
	2*foo$worker_1 +
	3*foo$worker_2 +
	4*foo$worker_3 +
	5*foo$worker_4 +
	6*foo$worker_5to9 +
	7*foo$worker_10to24 +
	8*foo$worker_25to99 +
	9*foo$worker_100up


	# Treatment Indicator
	foo$treat <- foo$status == "Sudah"

	# The only bank code (var 1) is PEM
	# The only bank name (var 2) is PEMDA -- regional government, not a bank
	# (var 3) 174 outlier ages
	# (var 4) gender is 3 levels: men, women, and business entity ("LAKI-LAKI" "PEREMPUAN" "BADAN USAHA")
	# (var 5) marital_status: business agency, single, married ("BADAN USAHA" "BELUM KAWIN" "KAWIN")

	### (var 6) EDUCATION
	# "BADAN USAHA" "DIPLOMA" "LAINNYA" "SARJANA" "SD" "SMP" "SMU"
	# "BUSINESS AGENCY" "DIPLOMA" "OTHER" "GRADUATE" "Elementary School" "Middle School" "High School"

	### (var 7) OCCUPATION
	#[1] "KARYAWAN SWASTA" "LAIN-LAIN/BADAN USAHA" "NELAYAN" "PEDAGANG"
	#[5] "PENSIUNAN/PURNAWIRAWAN" "PETANI" "PNS" "PROFESIONAL"
	#[9] "TNI/POLRI" "WIRASWASTA"

	#[1] "PRIVATE EMPLOYEES" [2] "OTHERS / BUSINESS AGENCIES" [3] "FISHERMANS" [4] "TRADERS"
	#[5] "PENSIONERS / PURNAWIRAWAN" [6] "FARMERS" [7] "CIVIL SERVANT" [8] "PROFESSIONAL"
	#[9] "ARMY / POLICE" [10] "ENTREPRENEURS"

	### (var 8) 938 unique POSTAL CODES

	### (var 9) 63 unique DISTRICT CODES

	### (var 10) : "worker". unknown meaning... 300+ unique numerical values (maybe no. of staff?)

	### (var 11) (capital)
	# 0% 25% 50% 75% 100%
	# 0.00e+00 1.00e+06 1.00e+07 1.50e+07 1.75e+11

	### (var 12) (credit proposal)
	# 0% 25% 50% 75% 100%
	# 0.0e+00 0.0e+00 1.0e+07 2.5e+07 5.0e+09

	### (var 13) STATUS
	# "Belum" "Sudah"
	# "Not Yet" "Already"

	### (var 14) randomid

	foo_badan <- foo[which(foo$gender == "BADAN USAHA"), ]
	foo_people <- foo[-which(foo$gender == "BADAN USAHA"), ]

	######## CODE MODIFICATION SHOULD BEGIN HERE...
	foo$district_code2 <- substr(foo$district_code,0,2) #taking the first two digits from the district code
	foo$district_code2<-as.numeric(foo$district_code2)

	X = cbind(foo$R_date_of_birth, foo$gender, foo$marital_status,
	foo$education, foo$occupation, foo$district_code2, #swapped out postal code for district code
	foo$worker, foo$capital, foo$credit_proposal,
	foo$worker_transformed, foo$capital_transformed, foo$credit_proposal_transformed,
	foo$missing_date_of_birth,
	foo$NA_postal_code,
	foo$NA_capital,
	foo$NA_credit_proposal)

	##match on date_of_birth within a 1-year (365-day) caliper
	caliper <- 1/(sd(foo$R_date_of_birth)/365) #finding the standard deviation of a year (365 days)

	Tr <- foo$treat

	BalanceMat <- X
	#
	library(rgenoud)
	genout <- GenMatch(Tr=Tr, X=X, BalanceMatrix=BalanceMat, estimand="ATT", M=1,
	pop.size=10, max.generations=2, wait.generations=1,
	#decreased pop.size, max.generations & wait.generations to reduce wait time to 1.5 hours
	caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
	NULL,NULL), #adding caliper only for the date of birth
	exact = c(FALSE, TRUE, TRUE,
	TRUE, TRUE, TRUE,
	FALSE, FALSE, FALSE,
	TRUE, TRUE, TRUE,
	TRUE,
	TRUE,
	TRUE,
	TRUE))
	#


	mout <- Match(Tr=Tr, X=X, estimand="ATT", M=1,
	caliper = c(caliper,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
	NULL,NULL), #adding caliper only for the date of birth
	exact = c(FALSE, TRUE, TRUE,
	TRUE, TRUE, TRUE,
	FALSE, FALSE, FALSE,
	TRUE, TRUE, TRUE,
	TRUE, TRUE, TRUE, TRUE),
	Weight.matrix = genout) #swapped the weight matrix with the new weights
	summary(mout)


	mb <- MatchBalance(foo$treat~
	foo$R_date_of_birth + foo$gender + foo$marital_status +
	foo$education + foo$occupation + foo$district_code2 + #swapped postal code for district code
	foo$worker + foo$capital + foo$credit_proposal +
	foo$worker_transformed + foo$capital_transformed +
	foo$credit_proposal_transformed +
	foo$missing_date_of_birth +
	foo$NA_postal_code +
	foo$NA_capital +
	foo$NA_credit_proposal,
	match.out=mout, nboots=500)
	library(agridat)
	library(Matching)
	library(sensitivitymv)
	library(rbounds)
	foo <- diggle.cow

	#adjusting for NA values
	foo$weight[which(is.na(foo$weight)==TRUE)]<- -999

	#treatment group interpreted as getting both of the randomised treatments
	foo$treat <- foo$infect == 'Infected' & foo$iron=="Iron"

	set.seed(2020)
	glm <- glm(foo$weight ~ foo$treat) #regression with treatment as dependent variable

	X <- glm$fitted
	Y <- foo$weight
	Tr<- foo$treat

	genout <- GenMatch(X=X, Tr=Tr, M=1, estimand ="ATT",
	pop.size = 100, max.generations = 10,
	wait.generations = 1, ties =FALSE) #chose from ties based on order for faster runtime

	mout<- Match(Y=Y, Tr=Tr, X=X, M=1, estimand = 'ATT',
	Weight.matrix = genout, ties =FALSE) #chose from ties based on order for faster runtime

	psens(mout, Gamma=1.5, GammaInc=.1) #sensitivity analysis