Skip to content

Instantly share code, notes, and snippets.

@pattoM
Last active February 20, 2018 13:27
Show Gist options
  • Save pattoM/9054744765c747c3453bb33c72b117ec to your computer and use it in GitHub Desktop.
Code for assignment - Lalonde 3 ways
# Load the Matching package and the Lalonde experimental data set
library("Matching")
data("lalonde")

# Split the sample by degree status (nodegr == 1 means no high-school degree)
nodegree <- subset(lalonde, nodegr == 1)
degree <- subset(lalonde, nodegr == 0)

# Scatterplots of 1978 earnings against each predictor, no-degree group
plot(re78 ~ treat + educ + age + re74 + re75, data = nodegree,
     xlab = "Variables", ylab = "Real earnings 1978",
     main = "Linear regression for without degree")
#abline(nodegreemod)

# Fit and summarise the linear model for the no-degree subset
nodegreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = nodegree)
summary(nodegreemod)

# 95% confidence intervals for the fitted coefficients
confint(nodegreemod)
# Part 2 of the problem: repeat the analysis for the with-degree subset

# Scatterplots of 1978 earnings against each predictor, degree group
plot(re78 ~ treat + educ + age + re74 + re75, data = degree,
     xlab = "Variables", ylab = "Real earnings 1978",
     main = "Linear regression for with degree")
#abline(nodegreemod)

# Fit and summarise the linear model for the with-degree subset
degreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = degree)
summary(degreemod)

# 95% confidence intervals for the fitted coefficients
confint(degreemod)
#QUESTION 2
# Model the treatment effect with a treatment-by-degree interaction.
# An earlier draft centred the variables before forming the product; the
# commented lines are kept for reference.
#Centering
#nodegrc <- nodegr-mean(nodegr)
#treatc <- treat-mean(treat)
#creating actual interaction term by multiplication
#degtreat <- nodegrc*treatc

# treat:nodegr is the idiomatic formula interaction; for two numeric
# variables it is the elementwise product, so the fit is numerically
# identical to the original I(nodegr*treat) term.
interactionmodel <- lm(re78 ~ treat + educ + age + re74 + re75 + treat:nodegr,
                       data = lalonde)
summary(interactionmodel)
# 95% confidence intervals for the coefficients
confint(interactionmodel, level = 0.95)
# Simulate coefficient uncertainty with arm::sim().
# BUG FIX: sim() is provided by the arm package, which the original script
# never loaded, so this section errored with "could not find function".
library(arm)
interactionmodelsim <- sim(interactionmodel)
coef(interactionmodelsim)

# Column-wise summaries of the simulated coefficient draws
meanofcoeff <- apply(coef(interactionmodelsim), 2, mean) # Means of the simulated coefficients
sdofcoeff <- apply(coef(interactionmodelsim), 2, sd)     # SDs of the simulated coefficients

# Normal-approximation 95% interval: mean +/- 1.96 simulated SDs
low.bound <- meanofcoeff - 1.96 * sdofcoeff
low.bound
upper.bound <- meanofcoeff + 1.96 * sdofcoeff
upper.bound

#Data visualization
#install.packages("sjPlot")
library(sjPlot)
# NOTE(review): sjp.lm()/sjp.int() are deprecated in current sjPlot releases
# (superseded by plot_model()); kept as-is for the package version this
# script was written against — confirm before upgrading sjPlot.
sjp.lm(interactionmodel, show.loess.ci = T, show.values = T, show.summary = T)
sjp.int(interactionmodel, show.values = T, show.ci = T)
#QUESTION 3
# Logistic regression needs a binary outcome: u78 = 1 if the person had any
# earnings in 1978, else 0. A single vectorized comparison replaces the
# original pair of indexed assignments; re78 <= 0 vs re78 > 0 partition the
# data identically, and NA in re78 stays NA either way.
lalonde$u78 <- as.numeric(lalonde$re78 > 0)

# Split by degree status, as in Question 1
logistic.nodegree <- subset(lalonde, nodegr == 1)
logistic.degree <- subset(lalonde, nodegr == 0)

# Logistic regression for the no-degree subset
logisticmod1 <- glm(u78 ~ treat + educ + age + re74 + re75,
                    data = logistic.nodegree, family = binomial)
summary(logisticmod1)
confint(logisticmod1)

# Logistic regression for the with-degree subset
logisticmod2 <- glm(u78 ~ treat + educ + age + re74 + re75,
                    data = logistic.degree, family = binomial)
summary(logisticmod2)
confint(logisticmod2)
#PROBLEM 4
#Guidance From https://datascienceplus.com/random-forests-in-r/
# library() errors immediately if the package is missing; require() only
# warns and returns FALSE, which would let the script fail later instead.
library(randomForest)
set.seed(1)

# Check dimensions for lalonde (13 columns here: the 12 original plus the
# derived u78 from Question 3)
dim(lalonde)

# Train/test split: 300 sampled row indices.
# NOTE(review): replace = TRUE bootstrap-samples the training rows (duplicates
# allowed) and shrinks the effective test set; replace = FALSE is the
# conventional choice for a split. Kept to reproduce the original results.
trainset <- sample(seq_len(nrow(lalonde)), 300, replace = TRUE)

# Forest using every other column as predictor.
# NOTE(review): u78 is derived directly from re78, so including it leaks the
# response into the predictors — confirm whether it should be dropped here.
lalonde.rf <- randomForest(re78 ~ ., data = lalonde, subset = trainset,
                           importance = TRUE)
lalonde.rf
plot(lalonde.rf)

# Compare out-of-bag error with held-out test error across mtry values
# (12 candidate predictors once the response re78 is excluded)
oob.err <- double(12)
test.err <- double(12)
# mtry = number of variables randomly tried at each split; look for the
# value with the smallest error
for (mtry in seq_len(12)) {
  rf <- randomForest(re78 ~ ., data = lalonde, subset = trainset,
                     mtry = mtry, ntree = 300)
  oob.err[mtry] <- rf$mse[300] # OOB MSE after all 300 trees
  pred <- predict(rf, lalonde[-trainset, ]) # predictions on held-out rows
  test.err[mtry] <- with(lalonde[-trainset, ], mean((re78 - pred)^2)) # test MSE
  cat(mtry, " ") # progress indicator
}
test.err
oob.err

# Plot both error curves against mtry
matplot(seq_len(12), cbind(oob.err, test.err), pch = 19,
        col = c("red", "blue"), type = "b", ylab = "Mean Squared Error",
        xlab = "Number of Predictors Considered at each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"),
       pch = 19, col = c("red", "blue"))
varImpPlot(lalonde.rf, main = "Trained forest Variable Importance Plot")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment