Skip to content

Instantly share code, notes, and snippets.

@Anna-Nevm
Last active September 26, 2020 03:56
Show Gist options
  • Save Anna-Nevm/90a0c7bdd4a70b955b7e5586b32ac5be to your computer and use it in GitHub Desktop.
Save Anna-Nevm/90a0c7bdd4a70b955b7e5586b32ac5be to your computer and use it in GitHub Desktop.
Classic Univariate Regression and Bootstrapping (Lalonde dataset)
#_________________________________1_________________________________
#generating 998 observations for X from uniform distribution
x <- runif(998, min = 0, max=10)
# data-generating equation for y = 15 - 0.25x + e;
y = 15 - 0.055 * x + rnorm(998, 0.5, 0.2)
#e=rnorm(998, 0.5, 0.2) - irreducible error drawn from normal distribution, should average to 0.5 with 0.2 SD
#creating data frame with predicted values of y for each x
df_no_outliers <- data.frame(x, y)
#creating simple univariate regression
regression <- lm(y ~ x, data = df_no_outliers)
summary(regression)
#getting regression coefficients to later use for plotting the regression line
coef(regression)
intercept <- coef(regression)[1]
slope <-coef(regression)[2]
#creating a new dataset with two outliers
df_with_outliers <- rbind(df_no_outliers, c(0.5,-50), c(0.3,-38))
#these really small values of y would pull the regression line down to the left and change the value of the slope to positive
#creating simple regression for the dataset with outliers
regression_with_outliers <- lm(y ~ x, data = df_with_outliers)
summary(regression_with_outliers)
coef(regression)
coef(regression_with_outliers)
#plotting the data with regression lines for regression on both datasets
intercept_outliers <-coef(regression_with_outliers)[1]
slope_outliers <-coef(regression_with_outliers)[2]
plot(df_no_outliers,
xlab = "x",
ylab = "y",
pch=21, cex = .4, col=c("peachpuff2"))
abline(a = intercept,
b = slope,
col = "deepskyblue3",
lwd=2)
abline(a = intercept_outliers,
b = slope_outliers,
col = "brown1",lwd=2)
#________________________________2_________________________________
library(Matching)
data(lalonde)
#adding two independent variables from the assignment prompt - age^2 and treat*age- to the lalonde dataset
lalonde$age_squared <- lalonde$age^2
lalonde$t_a <- lalonde$age*lalonde$treat
#checking whether new columns were added
colnames(lalonde)
#creating a linear model that predicts re78 as a linear additive function of age, age^2, educ, treat, treat*age, re74, and re75
regression <- lm(lalonde$re78 ~ lalonde$age + lalonde$age_squared + lalonde$educ + lalonde$treat + lalonde$t_a + lalonde$re74 + lalonde$re75)
summary(regression)
coef(regression)
#creating a simulation based on regression model for the whole lalonde dataset
library('arm')
iterations <- 10000
simulation <- sim(regression, n.sims=iterations)
#storing min and max age values for easier reference (17 and 55)
min_age <- min(lalonde$age)
max_age <- max(lalonde$age)
#___________QUESTION 2-a________
#the 95% interval of expected values for re78, for every TREATED UNIT
#creating a dataset that only contains data for treated units
lalonde_treated <- lalonde[which(lalonde$treat == 1), ]
#creating a matrix populated with NAs that would be later used to store expected values
simulated_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) #age range: 17-55
#creating variables with the values of means of educ, re74 and re75 to further use them for holding these predictors constant during the simulation of expected values
mean_educ_treated<- mean(lalonde_treated$educ)
mean_re74_treated <- mean(lalonde_treated$re74)
mean_re75_treated <- mean(lalonde_treated$re75)
mean_educ_treated
mean_re74_treated
mean_re75_treated
#checking the order of the coefficients in the simulation so it can be matched when populating expected values
colnames(simulation_treated@coef)
#generating expected values
for (age in min_age:max_age) {
#1 is the intercept which has to be kept at the same value as in simulation
#lalonde_treated$treat = 1
predictors_treated <- c(1, age, age^2, mean_educ_treated, 1, age*1, mean_re74_treated, mean_re75_treated)
#populating the matrix with expected values
for (i in 1:iterations) {
#i - row, [17+1-minimum age] - proper index for the column; multiplying all simulated coefficients by their corresponding predictors
simulated_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated*simulation@coef[i,])
}
}
#predictors_treated
#simulated_EV_treated
simulated_EV_treated
#calculating 95% confidence intervals
CI_mean_treated <- apply(simulated_EV_treated, 2, quantile, probs = c(0.025, 0.975))
#putting them in a data frame for a better display
table_treated_mean <- t(data.frame(CI_mean_treated))
#adding mean expected value for each age
Mean_treated <- colMeans(simulated_EV_treated)
treated_table <- cbind("Mean expected value"= Mean_treated, table_treated_mean)
treated_table
#plotting the prediction intervals for each age
plot(x = c(1:100), y = c(1:100), type = "n",
xlim = c(min_age,max_age),
ylim = c(-20000,20000),
main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (treatment group)", xlab = "Age",
ylab = "Expected value of Re78")
for (age in min_age:max_age) {
segments (x0 = age, y0 = CI_mean_treated[1, age - min_age + 1], x1 = age, y1 = CI_mean_treated[2, age - min_age + 1],lwd = 2)
}
#___________QUESTION 2-b________
# the 95% interval of expected values for re78, for every CONTROL unit
#repeating the above for control units
lalonde_control<- lalonde[which(lalonde$treat == 0), ]
#empty matrix for storing expected values for control units
simulated_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))
#creating variables with the values of means of educ, re74 and r75 to further use them for holding these predictors constant during the simulation of expected values
mean_educ_control<- mean(lalonde_control$educ)
mean_re74_control <- mean(lalonde_control$re74)
mean_re75_control <- mean(lalonde_control$re75)
mean_educ_control
mean_re74_control
mean_re75_control
for (age in min_age:max_age) {
#lalonde_control$treat is 0
predictors_control <- c(1, age, age^2, mean_educ, 0, age*0, mean_re74_control, mean_re75_control)
for (i in 1:iterations) {
simulated_EV_control[i, age + 1 - min_age] <- sum(predictors_control*simulation@coef[i,])
}
}
simulated_EV_control
CI_mean_control <- apply(simulated_EV_control, 2, quantile, probs = c(0.025, 0.975))
table_control_mean <- t(data.frame(CI_mean_control))
Mean_control<- colMeans(simulated_EV_control)
control_table <- cbind("Mean expected value"=Mean_control, table_control_mean)
control_table
plot(x = c(1:100), y = c(1:100), type = "n",
xlim = c(min(lalonde_ctrl$age),max(lalonde_ctrl$age)),
ylim = c(-10000,20000),
#think these through
main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (control group)", xlab = "Age",
ylab = "Expected value of Re78")
for (age in min_age:max_age) {
segments (x0 = age, y0 = CI_mean_control[1, age - min_age + 1], x1 = age, y1 = CI_mean_control[2, age - min_age + 1],lwd = 2)
}
#___________QUESTION 2-c________
#the 95% interval of expected values for the treatment effect
simulated_EV_treated
simulated_EV_control
#calculating the treatment effect and corresponding confidence intervals
treatment_effect <- simulated_EV_treated - simulated_EV_control
CI_treatment_effect <- apply(treatment_effect, 2, quantile, probs = c(0.025, 0.975))
#storing results in a tables and adding mean expected values for treatment effect for each age
treatment_effect_table_1 <- t(data.frame(CI_treatment_effect))
Mean_treatment_effect <- colMeans(CI_treatment_effect)
treatment_effect_table <- cbind("Mean expected treatment effect" =Mean_treatment_effect, treatment_effect_table_1)
treatment_effect_table
plot(x = c(1:100), y = c(1:100), type = "n",
xlim = c(min_age,max_age),
ylim = c(-10000,20000),
main = " Treatment effect by Age", xlab = "Age",
ylab = "Treatment effect")
for (age in min_age:max_age) {
segments (x0 = age, y0 = CI_treatment_effect[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect[2, age - min_age + 1],lwd = 2)
}
#___________QUESTION 2-d________
#using analogical code to 2a-2c to calculate the 95% prediction interval for the treatment effect holding predictors at the median
median_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))
median_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))
median_educ_treated<- median(lalonde_treated$educ)
median_re74_treated <- median(lalonde_treated$re74)
median_re75_treated <- median(lalonde_treated$re75)
median_educ_control<- median(lalonde_control$educ)
median_re74_control <- median(lalonde_control$re74)
median_re75_control <- median(lalonde_control$re75)
median_educ_treated
median_re74_treated
median_re75_treated
median_educ_control
median_re74_control
median_re75_control
for (age in min_age:max_age) {
predictors_treated_median <- c(1, age, age^2, median_educ_treated, 1, age*1, median_re74_treated, median_re75_treated)
predictors_control_median <- c(1, age, age^2, median_educ_control, 0, age*0, median_re74_control, median_re75_control)
for (i in 1:iterations) {
median_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
median_EV_control[i, age + 1 - min_age] <- sum(predictors_control_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
}
}
treatment_effect_median <- median_EV_treated - median_EV_control
CI_treatment_effect_median<- apply(treatment_effect_median, 2, quantile, probs = c(0.025, 0.975))
treatment_effect_median_table_1 <- t(data.frame(CI_treatment_effect_median))
treatment_effect_median_table_1
#install.packages("robustbase")
Median_treatment_effect <- colMedians(treatment_effect_median)
treatment_effect_median_table <- cbind(Median_treatment_effect, treatment_effect_median_table_1)
treatment_effect_median_table
plot(x = c(1:100), y = c(1:100), type = "n",
xlim = c(min_age,max_age),
ylim = c(-24000,24000),
main = " Treatment effect by Age", xlab = "Age",
ylab = "Treatment effect")
for (age in min_age:max_age) {
segments (x0 = age, y0 = CI_treatment_effect_median[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect_median[2, age - min_age + 1],lwd = 2)
}
#_________________________________3_________________________________
#loading the data
foo <- read.csv(url("https://tinyurl.com/y2prc9xq"))
head(foo)
#regression with MATH_SCORE as a dependent variable and TREATMENT as a predictor
regression_foo <-lm(foo$MATH_SCORE~ foo$TREATMENT)
#computing confidence intervals for parameters in regression_foo model
regression_CI <- confint(regression_foo) [2,]
regression_CI #ANSWER: 95% of the values lie within 2.5th and 97.5th percentiles - (2.758678, 4.998793)
#creating storage vector for bootstrapped values
iterations <- 10000
TREATMENT_coefficients <- rep(NA, iterations)
#a bootstrapping function for resampling values of coefficients for treatment
for (i in 1:iterations) {
regression_foo_bootstrap = lm(MATH_SCORE~TREATMENT, data = foo[sample(1:nrow(foo), nrow(foo), replace = T),])
TREATMENT_coefficients[i] <- regression_foo_bootstrap$coefficients[2]
}
#creating a histogram showing bootstrap-sample results of coefficients for TREATMENT
hist(TREATMENT_coefficients, border="black", col="moccasin", lwd=2,
xlab = "Bootstrapped values of coefficients for TREATMENT",
ylab = "Frequency")
#calculating 95% confidence intervals of the bootstrapped values
bootstrap_CI <- quantile(TREATMENT_coefficients, c(0.025, 0.975))
bootstrap_CI
#creating a table combining CI obtained from both bootstrapping and simple regression
data.frame(Bootsrapping = bootstrap_CI, Regression=regression_CI)
#_________________________________4_________________________________
#bootstrap function that takes actual Ys and predicted Ys as inputs and outputs R^2
rsquared <- function(actual_ys, predicted_ys){
r_squared <- rep(NA, 5000)
for (i in 1:5000) {
resample <- sample(1:length(predicted_ys), length(predicted_ys), replace = T)
#stores bootstrapped results in the matrix
r_squared[i] <- 1- (sum((actual_ys[resample] - predicted_ys[resample])**2)/sum((actual_ys[resample] - mean(actual_ys[resample]))**2))
}
#calculates mean R^2
return(mean(r_squared))
}
#checking if function works on afterschool data from Question #3
NAs_in_MATH_SCORE <-which(is.na(foo$MATH_SCORE))
no_NAs_foo <-foo[-NAs_in_MATH_SCORE, ]
regression_foo_no_NAs <- lm(no_NAs_foo$MATH_SCORE~no_NAs_foo$TREATMENT)
predicted_values <- predict(regression_foo_no_NAs)
rsquared(no_NAs_foo$MATH_SCORE, regression_foo$fitted.values)
#________________________________5_________________________________
data_5 <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
head(data_5)
set.seed(12345)
#randomly removing 2000 observations and setting it aside as a test set; creating training set
test_set_rows <- sample(1:length(data_5$age), 2000, replace = FALSE)
training_set <- data_5[-test_set_rows,]
test_set <-data_5[test_set_rows,]
print(test_set)
print(training_set)
#creating 5 logistic regression models
model_1 <- glm(treat~ married +hispanic + education+ u74, data =training_set,family=binomial)
model_2 <- glm(treat~ married, data =training_set)
model_3 <- glm(treat~ hispanic + education+ u74+ black + nodegree, data =training_set)
model_4 <- glm(treat~ u74 +u75,data =training_set)
model_5 <- glm(treat~ age+ education+ black+ hispanic+ married+ nodegree+re74+re75+ u74+ u75, data =training_set)
#running corss-validation for these models
LOOCV1 <- cv.glm(training_set,model_1)
LOOCV2 <- cv.glm(training_set,model_2)
LOOCV3 <- cv.glm(training_set,model_3)
LOOCV4 <- cv.glm(training_set,model_4)
LOOCV5 <- cv.glm(training_set,model_5)
?cv.glm
LOOCV5$delta[1] #raw cross-validation estimate of prediction error.
#calculating MSE for regression models; test set error misclassification rates make more sense for lgm but I'm very tired of this assignment, so MSE it is :(
predicted_values_1 <-predict.glm(model_1, test_set, type = "response")
predicted_values_2 <-predict.glm(model_2, test_set, type = "response")
predicted_values_3 <-predict.glm(model_3, test_set, type = "response")
predicted_values_4 <-predict.glm(model_4, test_set, type = "response")
predicted_values_5 <-predict.glm(model_5, test_set, type = "response")
MSE_1 <- mean((predicted_values_1- test_set$treat)^2)
MSE_2 <- mean((predicted_values_2- test_set$treat)^2)
MSE_3 <- mean((predicted_values_3- test_set$treat)^2)
MSE_4 <- mean((predicted_values_4- test_set$treat)^2)
MSE_5 <- mean((predicted_values_5- test_set$treat)^2)
MSE_1
MSE_2
MSE_3
MSE_4
MSE_5
#putting everything in a table
table_5_comparison <- data.frame("LOOCV"=rbind(LOOCV1$delta[1],LOOCV2$delta[1],LOOCV3$delta[1],LOOCV4$delta[1],LOOCV5$delta[1]),
"MSE"=rbind(MSE_1,MSE_2,MSE_3,MSE_4,MSE_5))
table_5_comparison
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment