Last active
October 20, 2018 05:19
-
-
Save Aaronmoralesshildrick/3a28d9c80f56087252482511d20609de to your computer and use it in GitHub Desktop.
Regression and Bootstrapping CS 112 Assignment 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Question 1 | |
set.seed(123) | |
#Creation of data | |
X <- c(runif(99, min = 0, max = 12)) | |
Y = (study_time^2 * 5) + rnorm(99, mean = 0, sd = 50) | |
mydata = data.frame(X, Y) | |
#Creation of the models and plots (also calls for summary function are here) | |
plot(Y~X, main = "How Outliers Affect Regression") | |
lm1 = lm(Y~X, data = mydata) | |
abline(lm1, col = "blue") | |
abline(lm2, col = "red") | |
summary(lm1) | |
#Plot and data with the extra observation | |
outlierdata = rbind(mydata, c(0, 20000)) | |
plot(Y~X, data = outlierdata, main = "Full Data with Outlier Position") | |
lm2 = lm(Y~X, data = outlierdata) | |
abline(lm2) | |
summary(lm2) | |
#Question 2 | |
excluded <- which(lalonde$treat == 1) | |
lalonde.control <- lalonde[-excluded, ] #Only control group patients | |
lm_control = lm(re78 ~ age + educ + re74 +re75 + | |
educ*re74 + educ*re75 + age*re74 + age*re75 + re74*re75, | |
data = lalonde.control) #Linear model with the specified predictors | |
lm_control$coef | |
#intercept = 3685.51, age = 2.21, educ = 39.06, re74 = -0.015, re75 = 0.784, | |
#educ:re74 = 0.0344, educ:re75 = -0.072, age:re74 = -0.0057, age:re75 = 0.0093, | |
#re74:re75 = -0.000022 | |
set.seed(123) | |
sim_results <- sim(lm_control, n.sims = 10000) | |
mean(sim_results@coef[,10]) #try this with all numbers for predictors gives following data | |
# From sim From lm_control | |
#Intercept 3645.201 3685.51 | |
#age 2.234 2.21 | |
#educ 43.319 39.06 | |
#re74 -0.023 -0.015 | |
#re75 0.832 0.784 | |
#educ:re74 0.033 0.0344 | |
#educ:re75 -0.074 -0.072 | |
#age:re74 -0.0051 -0.0057 | |
#age:re75 0.008 0.0093 | |
#re74:re75 -0.0000229 -0.000022 | |
sigmas <- c(sim_results@sigma) | |
coefs <- c(sim_results@coef) | |
#find the medians | |
med.edu = median(lalonde.control$educ) | |
med.re74 = median(lalonde.control$re74) | |
med.re75 = median(lalonde.control$re75) | |
ages = c(min(lalonde.control$age): max(lalonde.control$age)) | |
set.seed(1) | |
stored = data.frame() #Create an empty data frame | |
#Nested for loop that keeps valued at their medians and only age varies. | |
#Includes sigmas | |
for(j in ages){ | |
unit = c(1, j, med.edu, 0, 0, 0, 0, 0, 0, 0) | |
for(i in 1:10000){ | |
stored[i, j - 16] <- | |
sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i]) | |
} | |
} | |
stored | |
confints = apply(stored, 2, quantile, probs = c(0.025, 0.975)) #find confidence interval per age | |
confints2 | |
#plot the graph | |
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-20000,30000), | |
main = "Confidence intervals per age with variables at their medians", xlab = "Age", | |
ylab = "Real earnings in 1978") | |
for (age in ages) { | |
segments( | |
x0 = age, | |
y0 = confints[1, age - 16], | |
x1 = age, | |
y1 = confints[2, age - 16], | |
lwd = 2) | |
} | |
#90 Quantile problem | |
#set values at their 90th quantile | |
edu.90 = quantile(lalonde.control$educ, probs = c(0.9)) | |
re74.90 = quantile(lalonde.control$re74, probs = c(0.9)) | |
re75.90 = quantile(lalonde.control$re75, probs = c(0.9)) | |
set.seed(1) | |
stored2 = data.frame() | |
#repeat process with new values and data frame. | |
for(j in ages){ | |
unit = c(1, j, edu.90, re74.90, re75.90, j*re74.90, j*re75.90, edu.90*re74.90, edu.90*re75.90, re74.90*re75.90) | |
for(i in 1:10000){ | |
stored2[i, j - 16] <- | |
sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i]) | |
} | |
} | |
head(stored2) | |
confints2 = apply(stored2, 2, quantile, probs = c(0.025, 0.975)) | |
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-70000,70000), | |
main = "Confidence intervals per age with variables at their 90th quantile", xlab = "Age", | |
ylab = "Real earnings in 1978") | |
for (age in ages) { | |
segments( | |
x0 = age, | |
y0 = confints2[1, age - 16], | |
x1 = age, | |
y1 = confints2[2, age - 16], | |
lwd = 2) | |
} | |
#Question 3 | |
library("foreign") | |
nsw.dta <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta") | |
nsw.dta | |
set.seed(123) | |
lmnsw = lm(re78~treat, data = nsw.dta) #linear model | |
storage <- c(rep (0, 10000)) #empty vector | |
#bootstrapp and create a model for each repetition. Store the coeficients in the vector | |
for(i in 1:10000){ | |
index <- sample(1:nrow(nsw.dta), nrow(nsw.dta), replace = TRUE) | |
boot.sample = nsw.dta[index, ] | |
bootmodel = lm(re78~treat, data = boot.sample) | |
storage[i] = bootmodel$coefficients[2] | |
} | |
quant = quantile(storage, probs = c(0.025, 0.975)) | |
quant | |
confint(lmnsw) #compare with the function created above | |
hist(storage, xlab = "Value of coefficients", main = "Histogram for bootstrapped coefficients", | |
col = "steelblue4") | |
#Question 4 | |
r.sqr <- function(y, y1){ | |
SE <- sum((y1 - y) ^ 2) ## Squared error | |
ME <- sum((y - mean(y)) ^ 2) ## error from mean | |
rsq <- 1 - SE/ME | |
return(rsq) | |
} | |
testmodel <- lm(re75~education * age + age + education, data = nsw.dta) | |
preds <- predict(testmodel) | |
r.sqr(nsw.dta$re75, preds) | |
#Question 5 | |
nsw <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta") | |
nsw <- nsw[, -1] | |
model1 <- lm(treat~.-re78, data = nsw, family = "binomial") | |
summary(model1) | |
pred <- predict(model1, type = "response") | |
proba <- data.frame(nsw$treat, pred) | |
head(proba) | |
treat <- subset(proba, proba$nsw.treat == 1) | |
control <- subset(proba, proba$nsw.treat == 0) | |
head(treat) | |
head(control) | |
hist(treat$pred, ylim = c(0,125), xlim = c(0.3, 0.6), col = "firebrick", xlab = "Probabilities", | |
main = "Histogram for probabilities in the treatment group") | |
hist(control$pred, ylim = c(0,200), xlim = c(0.3, 0.55), col = "dodgerblue4", | |
main = "Histogram for probabilities in the control group") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment