Skip to content

Instantly share code, notes, and snippets.

@pattoM
Last active February 20, 2018 13:27
Show Gist options
  • Save pattoM/9054744765c747c3453bb33c72b117ec to your computer and use it in GitHub Desktop.
Code for assignment - Lalonde 3 ways
# Load the Matching package and the Lalonde experimental data set
library("Matching")
data("lalonde")

# Split the sample by degree status (nodegr == 1 means no high-school degree)
nodegree <- subset(lalonde, nodegr == 1)
degree <- subset(lalonde, nodegr == 0)

# Scatterplots of 1978 earnings against each predictor, no-degree group
plot(re78 ~ treat + educ + age + re74 + re75, data = nodegree,
     xlab = "Variables", ylab = "Real earnings 1978",
     main = "Linear regression for without degree")
#abline(nodegreemod)

# Fit and summarise the linear model for the no-degree subset
nodegreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = nodegree)
summary(nodegreemod)

# 95% confidence intervals for the fitted coefficients
confint(nodegreemod)
# Part 2 of the problem: repeat the analysis for the with-degree subset

# Scatterplots of 1978 earnings against each predictor, degree group
plot(re78 ~ treat + educ + age + re74 + re75, data = degree,
     xlab = "Variables", ylab = "Real earnings 1978",
     main = "Linear regression for with degree")
#abline(nodegreemod)

# Fit and summarise the linear model for the with-degree subset
degreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = degree)
summary(degreemod)

# 95% confidence intervals for the fitted coefficients
confint(degreemod)
#QUESTION 2
# Model the treatment effect with a treatment-by-degree interaction.
# An earlier draft centred the variables before forming the product; the
# commented lines are kept for reference.
#Centering
#nodegrc <- nodegr-mean(nodegr)
#treatc <- treat-mean(treat)
#creating actual interaction term by multiplication
#degtreat <- nodegrc*treatc

# treat:nodegr is the idiomatic formula interaction; for two numeric
# variables it is the elementwise product, so the fit is numerically
# identical to the original I(nodegr*treat) term.
interactionmodel <- lm(re78 ~ treat + educ + age + re74 + re75 + treat:nodegr,
                       data = lalonde)
summary(interactionmodel)
# 95% confidence intervals for the coefficients
confint(interactionmodel, level = 0.95)
# Simulate coefficient uncertainty with arm::sim().
# BUG FIX: sim() is provided by the arm package, which the original script
# never loaded, so this section errored with "could not find function".
library(arm)
interactionmodelsim <- sim(interactionmodel)
coef(interactionmodelsim)

# Column-wise summaries of the simulated coefficient draws
meanofcoeff <- apply(coef(interactionmodelsim), 2, mean) # Means of the simulated coefficients
sdofcoeff <- apply(coef(interactionmodelsim), 2, sd)     # SDs of the simulated coefficients

# Normal-approximation 95% interval: mean +/- 1.96 simulated SDs
low.bound <- meanofcoeff - 1.96 * sdofcoeff
low.bound
upper.bound <- meanofcoeff + 1.96 * sdofcoeff
upper.bound

#Data visualization
#install.packages("sjPlot")
library(sjPlot)
# NOTE(review): sjp.lm()/sjp.int() are deprecated in current sjPlot releases
# (superseded by plot_model()); kept as-is for the package version this
# script was written against — confirm before upgrading sjPlot.
sjp.lm(interactionmodel, show.loess.ci = T, show.values = T, show.summary = T)
sjp.int(interactionmodel, show.values = T, show.ci = T)
#QUESTION 3
# Logistic regression needs a binary outcome: u78 = 1 if the person had any
# earnings in 1978, else 0. A single vectorized comparison replaces the
# original pair of indexed assignments; re78 <= 0 vs re78 > 0 partition the
# data identically, and NA in re78 stays NA either way.
lalonde$u78 <- as.numeric(lalonde$re78 > 0)

# Split by degree status, as in Question 1
logistic.nodegree <- subset(lalonde, nodegr == 1)
logistic.degree <- subset(lalonde, nodegr == 0)

# Logistic regression for the no-degree subset
logisticmod1 <- glm(u78 ~ treat + educ + age + re74 + re75,
                    data = logistic.nodegree, family = binomial)
summary(logisticmod1)
confint(logisticmod1)

# Logistic regression for the with-degree subset
logisticmod2 <- glm(u78 ~ treat + educ + age + re74 + re75,
                    data = logistic.degree, family = binomial)
summary(logisticmod2)
confint(logisticmod2)
#PROBLEM 4
#Guidance From https://datascienceplus.com/random-forests-in-r/
# library() errors immediately if the package is missing; require() only
# warns and returns FALSE, which would let the script fail later instead.
library(randomForest)
set.seed(1)

# Check dimensions for lalonde (13 columns here: the 12 original plus the
# derived u78 from Question 3)
dim(lalonde)

# Train/test split: 300 sampled row indices.
# NOTE(review): replace = TRUE bootstrap-samples the training rows (duplicates
# allowed) and shrinks the effective test set; replace = FALSE is the
# conventional choice for a split. Kept to reproduce the original results.
trainset <- sample(seq_len(nrow(lalonde)), 300, replace = TRUE)

# Forest using every other column as predictor.
# NOTE(review): u78 is derived directly from re78, so including it leaks the
# response into the predictors — confirm whether it should be dropped here.
lalonde.rf <- randomForest(re78 ~ ., data = lalonde, subset = trainset,
                           importance = TRUE)
lalonde.rf
plot(lalonde.rf)

# Compare out-of-bag error with held-out test error across mtry values
# (12 candidate predictors once the response re78 is excluded)
oob.err <- double(12)
test.err <- double(12)
# mtry = number of variables randomly tried at each split; look for the
# value with the smallest error
for (mtry in seq_len(12)) {
  rf <- randomForest(re78 ~ ., data = lalonde, subset = trainset,
                     mtry = mtry, ntree = 300)
  oob.err[mtry] <- rf$mse[300] # OOB MSE after all 300 trees
  pred <- predict(rf, lalonde[-trainset, ]) # predictions on held-out rows
  test.err[mtry] <- with(lalonde[-trainset, ], mean((re78 - pred)^2)) # test MSE
  cat(mtry, " ") # progress indicator
}
test.err
oob.err

# Plot both error curves against mtry
matplot(seq_len(12), cbind(oob.err, test.err), pch = 19,
        col = c("red", "blue"), type = "b", ylab = "Mean Squared Error",
        xlab = "Number of Predictors Considered at each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"),
       pch = 19, col = c("red", "blue"))
varImpPlot(lalonde.rf, main = "Trained forest Variable Importance Plot")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment