Skip to content

Instantly share code, notes, and snippets.

@Aaronmoralesshildrick
Last active October 20, 2018 05:19
Show Gist options
  • Save Aaronmoralesshildrick/3a28d9c80f56087252482511d20609de to your computer and use it in GitHub Desktop.
Save Aaronmoralesshildrick/3a28d9c80f56087252482511d20609de to your computer and use it in GitHub Desktop.
Regression and Bootstrapping CS 112 Assignment 2
#Question 1
set.seed(123)
#Creation of data
X <- c(runif(99, min = 0, max = 12))
Y = (study_time^2 * 5) + rnorm(99, mean = 0, sd = 50)
mydata = data.frame(X, Y)
#Creation of the models and plots (also calls for summary function are here)
plot(Y~X, main = "How Outliers Affect Regression")
lm1 = lm(Y~X, data = mydata)
abline(lm1, col = "blue")
abline(lm2, col = "red")
summary(lm1)
#Plot and data with the extra observation
outlierdata = rbind(mydata, c(0, 20000))
plot(Y~X, data = outlierdata, main = "Full Data with Outlier Position")
lm2 = lm(Y~X, data = outlierdata)
abline(lm2)
summary(lm2)
#Question 2
excluded <- which(lalonde$treat == 1)
lalonde.control <- lalonde[-excluded, ] #Only control group patients
lm_control = lm(re78 ~ age + educ + re74 +re75 +
educ*re74 + educ*re75 + age*re74 + age*re75 + re74*re75,
data = lalonde.control) #Linear model with the specified predictors
lm_control$coef
#intercept = 3685.51, age = 2.21, educ = 39.06, re74 = -0.015, re75 = 0.784,
#educ:re74 = 0.0344, educ:re75 = -0.072, age:re74 = -0.0057, age:re75 = 0.0093,
#re74:re75 = -0.000022
set.seed(123)
sim_results <- sim(lm_control, n.sims = 10000)
mean(sim_results@coef[,10]) #try this with all numbers for predictors gives following data
# From sim From lm_control
#Intercept 3645.201 3685.51
#age 2.234 2.21
#educ 43.319 39.06
#re74 -0.023 -0.015
#re75 0.832 0.784
#educ:re74 0.033 0.0344
#educ:re75 -0.074 -0.072
#age:re74 -0.0051 -0.0057
#age:re75 0.008 0.0093
#re74:re75 -0.0000229 -0.000022
sigmas <- c(sim_results@sigma)
coefs <- c(sim_results@coef)
#find the medians
med.edu = median(lalonde.control$educ)
med.re74 = median(lalonde.control$re74)
med.re75 = median(lalonde.control$re75)
ages = c(min(lalonde.control$age): max(lalonde.control$age))
set.seed(1)
stored = data.frame() #Create an empty data frame
#Nested for loop that keeps valued at their medians and only age varies.
#Includes sigmas
for(j in ages){
unit = c(1, j, med.edu, 0, 0, 0, 0, 0, 0, 0)
for(i in 1:10000){
stored[i, j - 16] <-
sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
}
}
stored
confints = apply(stored, 2, quantile, probs = c(0.025, 0.975)) #find confidence interval per age
confints2
#plot the graph
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-20000,30000),
main = "Confidence intervals per age with variables at their medians", xlab = "Age",
ylab = "Real earnings in 1978")
for (age in ages) {
segments(
x0 = age,
y0 = confints[1, age - 16],
x1 = age,
y1 = confints[2, age - 16],
lwd = 2)
}
#90 Quantile problem
#set values at their 90th quantile
edu.90 = quantile(lalonde.control$educ, probs = c(0.9))
re74.90 = quantile(lalonde.control$re74, probs = c(0.9))
re75.90 = quantile(lalonde.control$re75, probs = c(0.9))
set.seed(1)
stored2 = data.frame()
#repeat process with new values and data frame.
for(j in ages){
unit = c(1, j, edu.90, re74.90, re75.90, j*re74.90, j*re75.90, edu.90*re74.90, edu.90*re75.90, re74.90*re75.90)
for(i in 1:10000){
stored2[i, j - 16] <-
sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
}
}
head(stored2)
confints2 = apply(stored2, 2, quantile, probs = c(0.025, 0.975))
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-70000,70000),
main = "Confidence intervals per age with variables at their 90th quantile", xlab = "Age",
ylab = "Real earnings in 1978")
for (age in ages) {
segments(
x0 = age,
y0 = confints2[1, age - 16],
x1 = age,
y1 = confints2[2, age - 16],
lwd = 2)
}
#Question 3
library("foreign")
nsw.dta <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
nsw.dta
set.seed(123)
lmnsw = lm(re78~treat, data = nsw.dta) #linear model
storage <- c(rep (0, 10000)) #empty vector
#bootstrapp and create a model for each repetition. Store the coeficients in the vector
for(i in 1:10000){
index <- sample(1:nrow(nsw.dta), nrow(nsw.dta), replace = TRUE)
boot.sample = nsw.dta[index, ]
bootmodel = lm(re78~treat, data = boot.sample)
storage[i] = bootmodel$coefficients[2]
}
quant = quantile(storage, probs = c(0.025, 0.975))
quant
confint(lmnsw) #compare with the function created above
hist(storage, xlab = "Value of coefficients", main = "Histogram for bootstrapped coefficients",
col = "steelblue4")
#Question 4
r.sqr <- function(y, y1){
SE <- sum((y1 - y) ^ 2) ## Squared error
ME <- sum((y - mean(y)) ^ 2) ## error from mean
rsq <- 1 - SE/ME
return(rsq)
}
testmodel <- lm(re75~education * age + age + education, data = nsw.dta)
preds <- predict(testmodel)
r.sqr(nsw.dta$re75, preds)
#Question 5
nsw <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
nsw <- nsw[, -1]
model1 <- lm(treat~.-re78, data = nsw, family = "binomial")
summary(model1)
pred <- predict(model1, type = "response")
proba <- data.frame(nsw$treat, pred)
head(proba)
treat <- subset(proba, proba$nsw.treat == 1)
control <- subset(proba, proba$nsw.treat == 0)
head(treat)
head(control)
hist(treat$pred, ylim = c(0,125), xlim = c(0.3, 0.6), col = "firebrick", xlab = "Probabilities",
main = "Histogram for probabilities in the treatment group")
hist(control$pred, ylim = c(0,200), xlim = c(0.3, 0.55), col = "dodgerblue4",
main = "Histogram for probabilities in the control group")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment