Last active
September 26, 2020 03:56
-
-
Save Anna-Nevm/90a0c7bdd4a70b955b7e5586b32ac5be to your computer and use it in GitHub Desktop.
Classic Univariate Regression and Bootstrapping (Lalonde dataset)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#_________________________________1_________________________________ | |
#generating 998 observations for X from uniform distribution | |
x <- runif(998, min = 0, max=10) | |
# data-generating equation for y = 15 - 0.25x + e; | |
y = 15 - 0.055 * x + rnorm(998, 0.5, 0.2) | |
#e=rnorm(998, 0.5, 0.2) - irreducible error drawn from normal distribution, should average to 0.5 with 0.2 SD | |
#creating data frame with predicted values of y for each x | |
df_no_outliers <- data.frame(x, y) | |
#creating simple univariate regression | |
regression <- lm(y ~ x, data = df_no_outliers) | |
summary(regression) | |
#getting regression coefficients to later use for plotting the regression line | |
coef(regression) | |
intercept <- coef(regression)[1] | |
slope <-coef(regression)[2] | |
#creating a new dataset with two outliers | |
df_with_outliers <- rbind(df_no_outliers, c(0.5,-50), c(0.3,-38)) | |
#these really small values of y would pull the regression line down to the left and change the value of the slope to positive | |
#creating simple regression for the dataset with outliers | |
regression_with_outliers <- lm(y ~ x, data = df_with_outliers) | |
summary(regression_with_outliers) | |
coef(regression) | |
coef(regression_with_outliers) | |
#plotting the data with regression lines for regression on both datasets | |
intercept_outliers <-coef(regression_with_outliers)[1] | |
slope_outliers <-coef(regression_with_outliers)[2] | |
plot(df_no_outliers, | |
xlab = "x", | |
ylab = "y", | |
pch=21, cex = .4, col=c("peachpuff2")) | |
abline(a = intercept, | |
b = slope, | |
col = "deepskyblue3", | |
lwd=2) | |
abline(a = intercept_outliers, | |
b = slope_outliers, | |
col = "brown1",lwd=2) | |
#________________________________2_________________________________ | |
library(Matching) | |
data(lalonde) | |
#adding two independent variables from the assignment prompt - age^2 and treat*age- to the lalonde dataset | |
lalonde$age_squared <- lalonde$age^2 | |
lalonde$t_a <- lalonde$age*lalonde$treat | |
#checking whether new columns were added | |
colnames(lalonde) | |
#creating a linear model that predicts re78 as a linear additive function of age, age^2, educ, treat, treat*age, re74, and re75 | |
regression <- lm(lalonde$re78 ~ lalonde$age + lalonde$age_squared + lalonde$educ + lalonde$treat + lalonde$t_a + lalonde$re74 + lalonde$re75) | |
summary(regression) | |
coef(regression) | |
#creating a simulation based on regression model for the whole lalonde dataset | |
library('arm') | |
iterations <- 10000 | |
simulation <- sim(regression, n.sims=iterations) | |
#storing min and max age values for easier reference (17 and 55) | |
min_age <- min(lalonde$age) | |
max_age <- max(lalonde$age) | |
#___________QUESTION 2-a________ | |
#the 95% interval of expected values for re78, for every TREATED UNIT | |
#creating a dataset that only contains data for treated units | |
lalonde_treated <- lalonde[which(lalonde$treat == 1), ] | |
#creating a matrix populated with NAs that would be later used to store expected values | |
simulated_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) #age range: 17-55 | |
#creating variables with the values of means of educ, re74 and re75 to further use them for holding these predictors constant during the simulation of expected values | |
mean_educ_treated<- mean(lalonde_treated$educ) | |
mean_re74_treated <- mean(lalonde_treated$re74) | |
mean_re75_treated <- mean(lalonde_treated$re75) | |
mean_educ_treated | |
mean_re74_treated | |
mean_re75_treated | |
#checking the order of the coefficients in the simulation so it can be matched when populating expected values | |
colnames(simulation_treated@coef) | |
#generating expected values | |
for (age in min_age:max_age) { | |
#1 is the intercept which has to be kept at the same value as in simulation | |
#lalonde_treated$treat = 1 | |
predictors_treated <- c(1, age, age^2, mean_educ_treated, 1, age*1, mean_re74_treated, mean_re75_treated) | |
#populating the matrix with expected values | |
for (i in 1:iterations) { | |
#i - row, [17+1-minimum age] - proper index for the column; multiplying all simulated coefficients by their corresponding predictors | |
simulated_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated*simulation@coef[i,]) | |
} | |
} | |
#predictors_treated | |
#simulated_EV_treated | |
simulated_EV_treated | |
#calculating 95% confidence intervals | |
CI_mean_treated <- apply(simulated_EV_treated, 2, quantile, probs = c(0.025, 0.975)) | |
#putting them in a data frame for a better display | |
table_treated_mean <- t(data.frame(CI_mean_treated)) | |
#adding mean expected value for each age | |
Mean_treated <- colMeans(simulated_EV_treated) | |
treated_table <- cbind("Mean expected value"= Mean_treated, table_treated_mean) | |
treated_table | |
#plotting the prediction intervals for each age | |
plot(x = c(1:100), y = c(1:100), type = "n", | |
xlim = c(min_age,max_age), | |
ylim = c(-20000,20000), | |
main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (treatment group)", xlab = "Age", | |
ylab = "Expected value of Re78") | |
for (age in min_age:max_age) { | |
segments (x0 = age, y0 = CI_mean_treated[1, age - min_age + 1], x1 = age, y1 = CI_mean_treated[2, age - min_age + 1],lwd = 2) | |
} | |
#___________QUESTION 2-b________ | |
# the 95% interval of expected values for re78, for every CONTROL unit | |
#repeating the above for control units | |
lalonde_control<- lalonde[which(lalonde$treat == 0), ] | |
#empty matrix for storing expected values for control units | |
simulated_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) | |
#creating variables with the values of means of educ, re74 and r75 to further use them for holding these predictors constant during the simulation of expected values | |
mean_educ_control<- mean(lalonde_control$educ) | |
mean_re74_control <- mean(lalonde_control$re74) | |
mean_re75_control <- mean(lalonde_control$re75) | |
mean_educ_control | |
mean_re74_control | |
mean_re75_control | |
for (age in min_age:max_age) { | |
#lalonde_control$treat is 0 | |
predictors_control <- c(1, age, age^2, mean_educ, 0, age*0, mean_re74_control, mean_re75_control) | |
for (i in 1:iterations) { | |
simulated_EV_control[i, age + 1 - min_age] <- sum(predictors_control*simulation@coef[i,]) | |
} | |
} | |
simulated_EV_control | |
CI_mean_control <- apply(simulated_EV_control, 2, quantile, probs = c(0.025, 0.975)) | |
table_control_mean <- t(data.frame(CI_mean_control)) | |
Mean_control<- colMeans(simulated_EV_control) | |
control_table <- cbind("Mean expected value"=Mean_control, table_control_mean) | |
control_table | |
plot(x = c(1:100), y = c(1:100), type = "n", | |
xlim = c(min(lalonde_ctrl$age),max(lalonde_ctrl$age)), | |
ylim = c(-10000,20000), | |
#think these through | |
main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (control group)", xlab = "Age", | |
ylab = "Expected value of Re78") | |
for (age in min_age:max_age) { | |
segments (x0 = age, y0 = CI_mean_control[1, age - min_age + 1], x1 = age, y1 = CI_mean_control[2, age - min_age + 1],lwd = 2) | |
} | |
#___________QUESTION 2-c________ | |
#the 95% interval of expected values for the treatment effect | |
simulated_EV_treated | |
simulated_EV_control | |
#calculating the treatment effect and corresponding confidence intervals | |
treatment_effect <- simulated_EV_treated - simulated_EV_control | |
CI_treatment_effect <- apply(treatment_effect, 2, quantile, probs = c(0.025, 0.975)) | |
#storing results in a tables and adding mean expected values for treatment effect for each age | |
treatment_effect_table_1 <- t(data.frame(CI_treatment_effect)) | |
Mean_treatment_effect <- colMeans(CI_treatment_effect) | |
treatment_effect_table <- cbind("Mean expected treatment effect" =Mean_treatment_effect, treatment_effect_table_1) | |
treatment_effect_table | |
plot(x = c(1:100), y = c(1:100), type = "n", | |
xlim = c(min_age,max_age), | |
ylim = c(-10000,20000), | |
main = " Treatment effect by Age", xlab = "Age", | |
ylab = "Treatment effect") | |
for (age in min_age:max_age) { | |
segments (x0 = age, y0 = CI_treatment_effect[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect[2, age - min_age + 1],lwd = 2) | |
} | |
#___________QUESTION 2-d________ | |
#using analogical code to 2a-2c to calculate the 95% prediction interval for the treatment effect holding predictors at the median | |
median_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) | |
median_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) | |
median_educ_treated<- median(lalonde_treated$educ) | |
median_re74_treated <- median(lalonde_treated$re74) | |
median_re75_treated <- median(lalonde_treated$re75) | |
median_educ_control<- median(lalonde_control$educ) | |
median_re74_control <- median(lalonde_control$re74) | |
median_re75_control <- median(lalonde_control$re75) | |
median_educ_treated | |
median_re74_treated | |
median_re75_treated | |
median_educ_control | |
median_re74_control | |
median_re75_control | |
for (age in min_age:max_age) { | |
predictors_treated_median <- c(1, age, age^2, median_educ_treated, 1, age*1, median_re74_treated, median_re75_treated) | |
predictors_control_median <- c(1, age, age^2, median_educ_control, 0, age*0, median_re74_control, median_re75_control) | |
for (i in 1:iterations) { | |
median_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i]) | |
median_EV_control[i, age + 1 - min_age] <- sum(predictors_control_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i]) | |
} | |
} | |
treatment_effect_median <- median_EV_treated - median_EV_control | |
CI_treatment_effect_median<- apply(treatment_effect_median, 2, quantile, probs = c(0.025, 0.975)) | |
treatment_effect_median_table_1 <- t(data.frame(CI_treatment_effect_median)) | |
treatment_effect_median_table_1 | |
#install.packages("robustbase") | |
Median_treatment_effect <- colMedians(treatment_effect_median) | |
treatment_effect_median_table <- cbind(Median_treatment_effect, treatment_effect_median_table_1) | |
treatment_effect_median_table | |
plot(x = c(1:100), y = c(1:100), type = "n", | |
xlim = c(min_age,max_age), | |
ylim = c(-24000,24000), | |
main = " Treatment effect by Age", xlab = "Age", | |
ylab = "Treatment effect") | |
for (age in min_age:max_age) { | |
segments (x0 = age, y0 = CI_treatment_effect_median[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect_median[2, age - min_age + 1],lwd = 2) | |
} | |
#_________________________________3_________________________________ | |
#loading the data | |
foo <- read.csv(url("https://tinyurl.com/y2prc9xq")) | |
head(foo) | |
#regression with MATH_SCORE as a dependent variable and TREATMENT as a predictor | |
regression_foo <-lm(foo$MATH_SCORE~ foo$TREATMENT) | |
#computing confidence intervals for parameters in regression_foo model | |
regression_CI <- confint(regression_foo) [2,] | |
regression_CI #ANSWER: 95% of the values lie within 2.5th and 97.5th percentiles - (2.758678, 4.998793) | |
#creating storage vector for bootstrapped values | |
iterations <- 10000 | |
TREATMENT_coefficients <- rep(NA, iterations) | |
#a bootstrapping function for resampling values of coefficients for treatment | |
for (i in 1:iterations) { | |
regression_foo_bootstrap = lm(MATH_SCORE~TREATMENT, data = foo[sample(1:nrow(foo), nrow(foo), replace = T),]) | |
TREATMENT_coefficients[i] <- regression_foo_bootstrap$coefficients[2] | |
} | |
#creating a histogram showing bootstrap-sample results of coefficients for TREATMENT | |
hist(TREATMENT_coefficients, border="black", col="moccasin", lwd=2, | |
xlab = "Bootstrapped values of coefficients for TREATMENT", | |
ylab = "Frequency") | |
#calculating 95% confidence intervals of the bootstrapped values | |
bootstrap_CI <- quantile(TREATMENT_coefficients, c(0.025, 0.975)) | |
bootstrap_CI | |
#creating a table combining CI obtained from both bootstrapping and simple regression | |
data.frame(Bootsrapping = bootstrap_CI, Regression=regression_CI) | |
#_________________________________4_________________________________ | |
#bootstrap function that takes actual Ys and predicted Ys as inputs and outputs R^2 | |
rsquared <- function(actual_ys, predicted_ys){ | |
r_squared <- rep(NA, 5000) | |
for (i in 1:5000) { | |
resample <- sample(1:length(predicted_ys), length(predicted_ys), replace = T) | |
#stores bootstrapped results in the matrix | |
r_squared[i] <- 1- (sum((actual_ys[resample] - predicted_ys[resample])**2)/sum((actual_ys[resample] - mean(actual_ys[resample]))**2)) | |
} | |
#calculates mean R^2 | |
return(mean(r_squared)) | |
} | |
#checking if function works on afterschool data from Question #3 | |
NAs_in_MATH_SCORE <-which(is.na(foo$MATH_SCORE)) | |
no_NAs_foo <-foo[-NAs_in_MATH_SCORE, ] | |
regression_foo_no_NAs <- lm(no_NAs_foo$MATH_SCORE~no_NAs_foo$TREATMENT) | |
predicted_values <- predict(regression_foo_no_NAs) | |
rsquared(no_NAs_foo$MATH_SCORE, regression_foo$fitted.values) | |
#________________________________5_________________________________ | |
data_5 <- read.csv(url("https://tinyurl.com/yx8tqf3k")) | |
head(data_5) | |
set.seed(12345) | |
#randomly removing 2000 observations and setting it aside as a test set; creating training set | |
test_set_rows <- sample(1:length(data_5$age), 2000, replace = FALSE) | |
training_set <- data_5[-test_set_rows,] | |
test_set <-data_5[test_set_rows,] | |
print(test_set) | |
print(training_set) | |
#creating 5 logistic regression models | |
model_1 <- glm(treat~ married +hispanic + education+ u74, data =training_set,family=binomial) | |
model_2 <- glm(treat~ married, data =training_set) | |
model_3 <- glm(treat~ hispanic + education+ u74+ black + nodegree, data =training_set) | |
model_4 <- glm(treat~ u74 +u75,data =training_set) | |
model_5 <- glm(treat~ age+ education+ black+ hispanic+ married+ nodegree+re74+re75+ u74+ u75, data =training_set) | |
#running corss-validation for these models | |
LOOCV1 <- cv.glm(training_set,model_1) | |
LOOCV2 <- cv.glm(training_set,model_2) | |
LOOCV3 <- cv.glm(training_set,model_3) | |
LOOCV4 <- cv.glm(training_set,model_4) | |
LOOCV5 <- cv.glm(training_set,model_5) | |
?cv.glm | |
LOOCV5$delta[1] #raw cross-validation estimate of prediction error. | |
#calculating MSE for regression models; test set error misclassification rates make more sense for lgm but I'm very tired of this assignment, so MSE it is :( | |
predicted_values_1 <-predict.glm(model_1, test_set, type = "response") | |
predicted_values_2 <-predict.glm(model_2, test_set, type = "response") | |
predicted_values_3 <-predict.glm(model_3, test_set, type = "response") | |
predicted_values_4 <-predict.glm(model_4, test_set, type = "response") | |
predicted_values_5 <-predict.glm(model_5, test_set, type = "response") | |
MSE_1 <- mean((predicted_values_1- test_set$treat)^2) | |
MSE_2 <- mean((predicted_values_2- test_set$treat)^2) | |
MSE_3 <- mean((predicted_values_3- test_set$treat)^2) | |
MSE_4 <- mean((predicted_values_4- test_set$treat)^2) | |
MSE_5 <- mean((predicted_values_5- test_set$treat)^2) | |
MSE_1 | |
MSE_2 | |
MSE_3 | |
MSE_4 | |
MSE_5 | |
#putting everything in a table | |
table_5_comparison <- data.frame("LOOCV"=rbind(LOOCV1$delta[1],LOOCV2$delta[1],LOOCV3$delta[1],LOOCV4$delta[1],LOOCV5$delta[1]), | |
"MSE"=rbind(MSE_1,MSE_2,MSE_3,MSE_4,MSE_5)) | |
table_5_comparison |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment