Anna-Nevm/Regression and Bootstrapping.R

## Regression and Bootstrapping.R


#_________________________________1_________________________________

#generating 998 observations for X from uniform distribution
x <- runif(998, min = 0, max=10)

# data-generating equation for  y =  15 - 0.25x + e;
y = 15 - 0.055 * x +  rnorm(998, 0.5, 0.2)
#e=rnorm(998, 0.5, 0.2) - irreducible error drawn from normal distribution, should average to 0.5 with 0.2 SD

#creating data frame with predicted values of y for each x
df_no_outliers <- data.frame(x, y)

#creating simple univariate regression
regression <- lm(y ~ x, data = df_no_outliers)
summary(regression)

#getting regression coefficients to later use for plotting the regression line
coef(regression)
intercept <- coef(regression)[1]
slope <-coef(regression)[2]

#creating a new dataset with two outliers
df_with_outliers <- rbind(df_no_outliers, c(0.5,-50), c(0.3,-38))
#these really small values of y would pull the regression line down to the left and change the value of the slope to positive

#creating simple regression for the dataset with outliers
regression_with_outliers <- lm(y ~ x, data = df_with_outliers)
summary(regression_with_outliers)


coef(regression)
coef(regression_with_outliers)


#plotting the data with regression lines for regression on both datasets
intercept_outliers <-coef(regression_with_outliers)[1]
slope_outliers <-coef(regression_with_outliers)[2]

plot(df_no_outliers,
     xlab = "x",
     ylab = "y",
     pch=21, cex = .4, col=c("peachpuff2"))
abline(a = intercept,
       b = slope,
       col = "deepskyblue3",
       lwd=2)
abline(a = intercept_outliers,
       b = slope_outliers,
       col = "brown1",lwd=2)


#________________________________2_________________________________

library(Matching)
data(lalonde)

#adding two independent variables from the assignment prompt - age^2 and treat*age- to the lalonde dataset
lalonde$age_squared <- lalonde$age^2
lalonde$t_a <- lalonde$age*lalonde$treat
#checking whether new columns were added
colnames(lalonde)

#creating a linear model that predicts re78 as a linear additive function of age, age^2, educ, treat, treat*age, re74, and re75
regression <- lm(lalonde$re78 ~ lalonde$age + lalonde$age_squared + lalonde$educ + lalonde$treat + lalonde$t_a + lalonde$re74 + lalonde$re75)
summary(regression)
coef(regression)

#creating a simulation based on regression model for the whole lalonde dataset
library('arm')
iterations <- 10000
simulation <- sim(regression, n.sims=iterations)

#storing min and max age values for easier reference (17 and 55)
min_age <- min(lalonde$age)
max_age <- max(lalonde$age)


#___________QUESTION 2-a________
#the 95% interval of expected values for re78, for every TREATED UNIT

#creating a dataset that only contains data for treated units
lalonde_treated <- lalonde[which(lalonde$treat == 1), ]

#creating a matrix populated with NAs that would be later used to store expected values
simulated_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) #age range: 17-55

#creating variables with the values of means of educ, re74 and re75 to further use them for holding these predictors constant during the simulation of expected values
mean_educ_treated<- mean(lalonde_treated$educ)
mean_re74_treated <- mean(lalonde_treated$re74)
mean_re75_treated <- mean(lalonde_treated$re75)

mean_educ_treated
mean_re74_treated
mean_re75_treated

#checking the order of the coefficients in the simulation so it can be matched when populating expected values
colnames(simulation_treated@coef)

#generating expected values
for (age in min_age:max_age) {
  #1 is the intercept which has to be kept at the same value as in simulation
  #lalonde_treated$treat = 1
  predictors_treated <- c(1, age, age^2, mean_educ_treated, 1, age*1, mean_re74_treated, mean_re75_treated)

  #populating the matrix with expected values
  for (i in 1:iterations) {
    #i - row, [17+1-minimum age] - proper index for the column; multiplying all simulated coefficients by their corresponding predictors
    simulated_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated*simulation@coef[i,])
  }
}

#predictors_treated
#simulated_EV_treated

simulated_EV_treated
#calculating 95% confidence intervals
CI_mean_treated <- apply(simulated_EV_treated, 2, quantile, probs = c(0.025, 0.975))
#putting them in a data frame for a better display
table_treated_mean <- t(data.frame(CI_mean_treated))
#adding mean expected value for each age
Mean_treated <- colMeans(simulated_EV_treated)
treated_table <- cbind("Mean expected value"= Mean_treated, table_treated_mean)
treated_table

#plotting the prediction intervals for each age
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min_age,max_age),
     ylim = c(-20000,20000),
     main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (treatment group)", xlab = "Age",
     ylab = "Expected value of Re78")
for (age in min_age:max_age) {
  segments (x0 = age, y0 = CI_mean_treated[1, age - min_age + 1], x1 = age, y1 = CI_mean_treated[2, age - min_age + 1],lwd = 2)
}


#___________QUESTION 2-b________

# the 95% interval of expected values for re78, for every CONTROL unit
#repeating the above for control units

lalonde_control<- lalonde[which(lalonde$treat == 0), ]

#empty matrix for storing expected values for control units
simulated_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))

#creating variables with the values of means of educ, re74 and r75 to further use them for holding these predictors constant during the simulation of expected values
mean_educ_control<- mean(lalonde_control$educ)
mean_re74_control <- mean(lalonde_control$re74)
mean_re75_control <- mean(lalonde_control$re75)
mean_educ_control
mean_re74_control
mean_re75_control

for (age in min_age:max_age) {
  #lalonde_control$treat is 0
  predictors_control <- c(1, age, age^2, mean_educ, 0, age*0, mean_re74_control, mean_re75_control)
  for (i in 1:iterations) {
    simulated_EV_control[i, age + 1 - min_age] <- sum(predictors_control*simulation@coef[i,])
  }
}
simulated_EV_control

CI_mean_control <- apply(simulated_EV_control, 2, quantile, probs = c(0.025, 0.975))

table_control_mean <- t(data.frame(CI_mean_control))
Mean_control<- colMeans(simulated_EV_control)
control_table <- cbind("Mean expected value"=Mean_control, table_control_mean)
control_table


plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min(lalonde_ctrl$age),max(lalonde_ctrl$age)),
     ylim = c(-10000,20000),
     #think these through
     main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (control group)", xlab = "Age",
     ylab = "Expected value of Re78")
for (age in min_age:max_age) {
  segments (x0 = age, y0 = CI_mean_control[1, age - min_age + 1], x1 = age, y1 = CI_mean_control[2, age - min_age + 1],lwd = 2)
}


#___________QUESTION 2-c________
#the 95% interval of expected values for the treatment effect

simulated_EV_treated
simulated_EV_control

#calculating the treatment effect and corresponding confidence intervals
treatment_effect <- simulated_EV_treated - simulated_EV_control
CI_treatment_effect <- apply(treatment_effect, 2, quantile, probs = c(0.025, 0.975))

#storing results in a tables and adding mean expected values for treatment effect for each age
treatment_effect_table_1 <- t(data.frame(CI_treatment_effect))
Mean_treatment_effect <- colMeans(CI_treatment_effect)
treatment_effect_table <- cbind("Mean expected treatment effect" =Mean_treatment_effect, treatment_effect_table_1)
treatment_effect_table


plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min_age,max_age),
     ylim = c(-10000,20000),
     main = " Treatment effect by Age", xlab = "Age",
     ylab = "Treatment effect")
for (age in min_age:max_age) {
  segments (x0 = age, y0 = CI_treatment_effect[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect[2, age - min_age + 1],lwd = 2)
}


#___________QUESTION 2-d________

#using analogical code to 2a-2c to calculate the 95% prediction interval for the treatment effect holding predictors at the median

median_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))
median_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))

median_educ_treated<- median(lalonde_treated$educ)
median_re74_treated <- median(lalonde_treated$re74)
median_re75_treated <- median(lalonde_treated$re75)

median_educ_control<- median(lalonde_control$educ)
median_re74_control <- median(lalonde_control$re74)
median_re75_control <- median(lalonde_control$re75)

median_educ_treated
median_re74_treated
median_re75_treated

median_educ_control
median_re74_control
median_re75_control


for (age in min_age:max_age) {
  predictors_treated_median <- c(1, age, age^2, median_educ_treated, 1, age*1, median_re74_treated, median_re75_treated)
  predictors_control_median <- c(1, age, age^2, median_educ_control, 0, age*0, median_re74_control, median_re75_control)

  for (i in 1:iterations) {
    median_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
    median_EV_control[i, age + 1 - min_age] <- sum(predictors_control_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
  }
}

treatment_effect_median <- median_EV_treated - median_EV_control
CI_treatment_effect_median<- apply(treatment_effect_median, 2, quantile, probs = c(0.025, 0.975))

treatment_effect_median_table_1 <- t(data.frame(CI_treatment_effect_median))
treatment_effect_median_table_1

#install.packages("robustbase")
Median_treatment_effect <- colMedians(treatment_effect_median)
treatment_effect_median_table <- cbind(Median_treatment_effect, treatment_effect_median_table_1)
treatment_effect_median_table


plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min_age,max_age),
     ylim = c(-24000,24000),
     main = " Treatment effect by Age", xlab = "Age",
     ylab = "Treatment effect")
for (age in min_age:max_age) {
  segments (x0 = age, y0 = CI_treatment_effect_median[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect_median[2, age - min_age + 1],lwd = 2)
}


#_________________________________3_________________________________

#loading the data
foo <- read.csv(url("https://tinyurl.com/y2prc9xq"))
head(foo)

#regression with MATH_SCORE as a dependent variable and TREATMENT as a predictor
regression_foo <-lm(foo$MATH_SCORE~ foo$TREATMENT)

#computing confidence intervals for parameters in regression_foo model
regression_CI <- confint(regression_foo) [2,]
regression_CI #ANSWER: 95% of the values lie within 2.5th and 97.5th percentiles - (2.758678, 4.998793)


#creating storage vector for bootstrapped values
iterations <- 10000
TREATMENT_coefficients <- rep(NA, iterations)
#a bootstrapping function for resampling values of coefficients for treatment
for (i in 1:iterations) {
  regression_foo_bootstrap = lm(MATH_SCORE~TREATMENT, data = foo[sample(1:nrow(foo), nrow(foo), replace = T),])
  TREATMENT_coefficients[i] <- regression_foo_bootstrap$coefficients[2]
}

#creating a histogram showing bootstrap-sample results of coefficients for TREATMENT
hist(TREATMENT_coefficients, border="black", col="moccasin", lwd=2,
     xlab = "Bootstrapped values of coefficients for TREATMENT",
     ylab = "Frequency")


#calculating 95% confidence intervals of the bootstrapped values
bootstrap_CI <- quantile(TREATMENT_coefficients, c(0.025, 0.975))
bootstrap_CI

#creating a table combining CI obtained from both bootstrapping and simple regression
data.frame(Bootsrapping = bootstrap_CI, Regression=regression_CI)


#_________________________________4_________________________________


#bootstrap function that takes actual Ys and predicted Ys as inputs and outputs R^2
rsquared <- function(actual_ys, predicted_ys){
  r_squared <- rep(NA, 5000)
  for (i in 1:5000) {
    resample <- sample(1:length(predicted_ys), length(predicted_ys), replace = T)
    #stores bootstrapped results in the matrix
    r_squared[i] <- 1- (sum((actual_ys[resample] - predicted_ys[resample])**2)/sum((actual_ys[resample] - mean(actual_ys[resample]))**2))
  }
#calculates mean R^2
    return(mean(r_squared))
}

#checking if function works on afterschool data from Question #3
NAs_in_MATH_SCORE <-which(is.na(foo$MATH_SCORE))
no_NAs_foo <-foo[-NAs_in_MATH_SCORE, ]
regression_foo_no_NAs <- lm(no_NAs_foo$MATH_SCORE~no_NAs_foo$TREATMENT)
predicted_values <- predict(regression_foo_no_NAs)
rsquared(no_NAs_foo$MATH_SCORE, regression_foo$fitted.values)


#________________________________5_________________________________

data_5 <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
head(data_5)
set.seed(12345)


#randomly removing 2000 observations and setting it aside as a test set; creating training set
test_set_rows <- sample(1:length(data_5$age), 2000, replace = FALSE)
training_set <- data_5[-test_set_rows,]
test_set <-data_5[test_set_rows,]

print(test_set)
print(training_set)

#creating 5 logistic regression models
model_1 <- glm(treat~ married +hispanic + education+ u74, data =training_set,family=binomial)
model_2 <- glm(treat~ married, data =training_set)
model_3 <- glm(treat~ hispanic + education+ u74+ black + nodegree, data =training_set)
model_4 <- glm(treat~ u74 +u75,data =training_set)
model_5 <- glm(treat~ age+ education+ black+ hispanic+ married+ nodegree+re74+re75+ u74+ u75, data =training_set)

#running corss-validation for these models
LOOCV1 <- cv.glm(training_set,model_1)
LOOCV2 <- cv.glm(training_set,model_2)
LOOCV3 <- cv.glm(training_set,model_3)
LOOCV4 <- cv.glm(training_set,model_4)
LOOCV5 <- cv.glm(training_set,model_5)

?cv.glm
LOOCV5$delta[1] #raw cross-validation estimate of prediction error.

#calculating MSE for regression models; test set error misclassification rates make more sense for lgm but I'm very tired of this assignment, so MSE it is :(
predicted_values_1 <-predict.glm(model_1, test_set, type = "response")
predicted_values_2 <-predict.glm(model_2, test_set, type = "response")
predicted_values_3 <-predict.glm(model_3, test_set, type = "response")
predicted_values_4 <-predict.glm(model_4, test_set, type = "response")
predicted_values_5 <-predict.glm(model_5, test_set, type = "response")

MSE_1 <- mean((predicted_values_1- test_set$treat)^2)
MSE_2 <- mean((predicted_values_2- test_set$treat)^2)
MSE_3 <- mean((predicted_values_3- test_set$treat)^2)
MSE_4 <- mean((predicted_values_4- test_set$treat)^2)
MSE_5 <- mean((predicted_values_5- test_set$treat)^2)

MSE_1
MSE_2
MSE_3
MSE_4
MSE_5

#putting everything in a table
table_5_comparison <- data.frame("LOOCV"=rbind(LOOCV1$delta[1],LOOCV2$delta[1],LOOCV3$delta[1],LOOCV4$delta[1],LOOCV5$delta[1]),
                                 "MSE"=rbind(MSE_1,MSE_2,MSE_3,MSE_4,MSE_5))
table_5_comparison



	#_________________________________1_________________________________

	#generating 998 observations for X from uniform distribution
	x <- runif(998, min = 0, max=10)

	# data-generating equation for y = 15 - 0.25x + e;
	y = 15 - 0.055 * x + rnorm(998, 0.5, 0.2)
	#e=rnorm(998, 0.5, 0.2) - irreducible error drawn from normal distribution, should average to 0.5 with 0.2 SD

	#creating data frame with predicted values of y for each x
	df_no_outliers <- data.frame(x, y)

	#creating simple univariate regression
	regression <- lm(y ~ x, data = df_no_outliers)
	summary(regression)

	#getting regression coefficients to later use for plotting the regression line
	coef(regression)
	intercept <- coef(regression)[1]
	slope <-coef(regression)[2]

	#creating a new dataset with two outliers
	df_with_outliers <- rbind(df_no_outliers, c(0.5,-50), c(0.3,-38))
	#these really small values of y would pull the regression line down to the left and change the value of the slope to positive

	#creating simple regression for the dataset with outliers
	regression_with_outliers <- lm(y ~ x, data = df_with_outliers)
	summary(regression_with_outliers)


	coef(regression)
	coef(regression_with_outliers)


	#plotting the data with regression lines for regression on both datasets
	intercept_outliers <-coef(regression_with_outliers)[1]
	slope_outliers <-coef(regression_with_outliers)[2]

	plot(df_no_outliers,
	xlab = "x",
	ylab = "y",
	pch=21, cex = .4, col=c("peachpuff2"))
	abline(a = intercept,
	b = slope,
	col = "deepskyblue3",
	lwd=2)
	abline(a = intercept_outliers,
	b = slope_outliers,
	col = "brown1",lwd=2)






	#________________________________2_________________________________

	library(Matching)
	data(lalonde)

	#adding two independent variables from the assignment prompt - age^2 and treat*age- to the lalonde dataset
	lalonde$age_squared <- lalonde$age^2
	lalonde$t_a <- lalonde$age*lalonde$treat
	#checking whether new columns were added
	colnames(lalonde)

	#creating a linear model that predicts re78 as a linear additive function of age, age^2, educ, treat, treat*age, re74, and re75
	regression <- lm(lalonde$re78 ~ lalonde$age + lalonde$age_squared + lalonde$educ + lalonde$treat + lalonde$t_a + lalonde$re74 + lalonde$re75)
	summary(regression)
	coef(regression)

	#creating a simulation based on regression model for the whole lalonde dataset
	library('arm')
	iterations <- 10000
	simulation <- sim(regression, n.sims=iterations)

	#storing min and max age values for easier reference (17 and 55)
	min_age <- min(lalonde$age)
	max_age <- max(lalonde$age)



	#___________QUESTION 2-a________
	#the 95% interval of expected values for re78, for every TREATED UNIT

	#creating a dataset that only contains data for treated units
	lalonde_treated <- lalonde[which(lalonde$treat == 1), ]

	#creating a matrix populated with NAs that would be later used to store expected values
	simulated_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age)) #age range: 17-55

	#creating variables with the values of means of educ, re74 and re75 to further use them for holding these predictors constant during the simulation of expected values
	mean_educ_treated<- mean(lalonde_treated$educ)
	mean_re74_treated <- mean(lalonde_treated$re74)
	mean_re75_treated <- mean(lalonde_treated$re75)

	mean_educ_treated
	mean_re74_treated
	mean_re75_treated

	#checking the order of the coefficients in the simulation so it can be matched when populating expected values
	colnames(simulation_treated@coef)

	#generating expected values
	for (age in min_age:max_age) {
	#1 is the intercept which has to be kept at the same value as in simulation
	#lalonde_treated$treat = 1
	predictors_treated <- c(1, age, age^2, mean_educ_treated, 1, age*1, mean_re74_treated, mean_re75_treated)

	#populating the matrix with expected values
	for (i in 1:iterations) {
	#i - row, [17+1-minimum age] - proper index for the column; multiplying all simulated coefficients by their corresponding predictors
	simulated_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated*simulation@coef[i,])
	}
	}

	#predictors_treated
	#simulated_EV_treated

	simulated_EV_treated
	#calculating 95% confidence intervals
	CI_mean_treated <- apply(simulated_EV_treated, 2, quantile, probs = c(0.025, 0.975))
	#putting them in a data frame for a better display
	table_treated_mean <- t(data.frame(CI_mean_treated))
	#adding mean expected value for each age
	Mean_treated <- colMeans(simulated_EV_treated)
	treated_table <- cbind("Mean expected value"= Mean_treated, table_treated_mean)
	treated_table

	#plotting the prediction intervals for each age
	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(min_age,max_age),
	ylim = c(-20000,20000),
	main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (treatment group)", xlab = "Age",
	ylab = "Expected value of Re78")
	for (age in min_age:max_age) {
	segments (x0 = age, y0 = CI_mean_treated[1, age - min_age + 1], x1 = age, y1 = CI_mean_treated[2, age - min_age + 1],lwd = 2)
	}


	#___________QUESTION 2-b________

	# the 95% interval of expected values for re78, for every CONTROL unit
	#repeating the above for control units

	lalonde_control<- lalonde[which(lalonde$treat == 0), ]

	#empty matrix for storing expected values for control units
	simulated_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))

	#creating variables with the values of means of educ, re74 and r75 to further use them for holding these predictors constant during the simulation of expected values
	mean_educ_control<- mean(lalonde_control$educ)
	mean_re74_control <- mean(lalonde_control$re74)
	mean_re75_control <- mean(lalonde_control$re75)
	mean_educ_control
	mean_re74_control
	mean_re75_control

	for (age in min_age:max_age) {
	#lalonde_control$treat is 0
	predictors_control <- c(1, age, age^2, mean_educ, 0, age*0, mean_re74_control, mean_re75_control)
	for (i in 1:iterations) {
	simulated_EV_control[i, age + 1 - min_age] <- sum(predictors_control*simulation@coef[i,])
	}
	}
	simulated_EV_control

	CI_mean_control <- apply(simulated_EV_control, 2, quantile, probs = c(0.025, 0.975))

	table_control_mean <- t(data.frame(CI_mean_control))
	Mean_control<- colMeans(simulated_EV_control)
	control_table <- cbind("Mean expected value"=Mean_control, table_control_mean)
	control_table


	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(min(lalonde_ctrl$age),max(lalonde_ctrl$age)),
	ylim = c(-10000,20000),
	#think these through
	main = "Expected Value of Real Earnings in 1978 by Age with Predictors Held at the Mean (control group)", xlab = "Age",
	ylab = "Expected value of Re78")
	for (age in min_age:max_age) {
	segments (x0 = age, y0 = CI_mean_control[1, age - min_age + 1], x1 = age, y1 = CI_mean_control[2, age - min_age + 1],lwd = 2)
	}


	#___________QUESTION 2-c________
	#the 95% interval of expected values for the treatment effect

	simulated_EV_treated
	simulated_EV_control

	#calculating the treatment effect and corresponding confidence intervals
	treatment_effect <- simulated_EV_treated - simulated_EV_control
	CI_treatment_effect <- apply(treatment_effect, 2, quantile, probs = c(0.025, 0.975))

	#storing results in a tables and adding mean expected values for treatment effect for each age
	treatment_effect_table_1 <- t(data.frame(CI_treatment_effect))
	Mean_treatment_effect <- colMeans(CI_treatment_effect)
	treatment_effect_table <- cbind("Mean expected treatment effect" =Mean_treatment_effect, treatment_effect_table_1)
	treatment_effect_table


	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(min_age,max_age),
	ylim = c(-10000,20000),
	main = " Treatment effect by Age", xlab = "Age",
	ylab = "Treatment effect")
	for (age in min_age:max_age) {
	segments (x0 = age, y0 = CI_treatment_effect[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect[2, age - min_age + 1],lwd = 2)
	}



	#___________QUESTION 2-d________

	#using analogical code to 2a-2c to calculate the 95% prediction interval for the treatment effect holding predictors at the median

	median_EV_treated <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))
	median_EV_control <- matrix(NA, nrow = iterations, ncol = length(min_age:max_age))

	median_educ_treated<- median(lalonde_treated$educ)
	median_re74_treated <- median(lalonde_treated$re74)
	median_re75_treated <- median(lalonde_treated$re75)

	median_educ_control<- median(lalonde_control$educ)
	median_re74_control <- median(lalonde_control$re74)
	median_re75_control <- median(lalonde_control$re75)

	median_educ_treated
	median_re74_treated
	median_re75_treated

	median_educ_control
	median_re74_control
	median_re75_control


	for (age in min_age:max_age) {
	predictors_treated_median <- c(1, age, age^2, median_educ_treated, 1, age*1, median_re74_treated, median_re75_treated)
	predictors_control_median <- c(1, age, age^2, median_educ_control, 0, age*0, median_re74_control, median_re75_control)

	for (i in 1:iterations) {
	median_EV_treated[i, age + 1 - min_age] <- sum(predictors_treated_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
	median_EV_control[i, age + 1 - min_age] <- sum(predictors_control_median*simulation@coef[i,]) + rnorm(1, 0, simulation@sigma[i])
	}
	}

	treatment_effect_median <- median_EV_treated - median_EV_control
	CI_treatment_effect_median<- apply(treatment_effect_median, 2, quantile, probs = c(0.025, 0.975))

	treatment_effect_median_table_1 <- t(data.frame(CI_treatment_effect_median))
	treatment_effect_median_table_1

	#install.packages("robustbase")
	Median_treatment_effect <- colMedians(treatment_effect_median)
	treatment_effect_median_table <- cbind(Median_treatment_effect, treatment_effect_median_table_1)
	treatment_effect_median_table


	plot(x = c(1:100), y = c(1:100), type = "n",
	xlim = c(min_age,max_age),
	ylim = c(-24000,24000),
	main = " Treatment effect by Age", xlab = "Age",
	ylab = "Treatment effect")
	for (age in min_age:max_age) {
	segments (x0 = age, y0 = CI_treatment_effect_median[1, age - min_age + 1], x1 = age, y1 = CI_treatment_effect_median[2, age - min_age + 1],lwd = 2)
	}




	#_________________________________3_________________________________

	#loading the data
	foo <- read.csv(url("https://tinyurl.com/y2prc9xq"))
	head(foo)

	#regression with MATH_SCORE as a dependent variable and TREATMENT as a predictor
	regression_foo <-lm(foo$MATH_SCORE~ foo$TREATMENT)

	#computing confidence intervals for parameters in regression_foo model
	regression_CI <- confint(regression_foo) [2,]
	regression_CI #ANSWER: 95% of the values lie within 2.5th and 97.5th percentiles - (2.758678, 4.998793)


	#creating storage vector for bootstrapped values
	iterations <- 10000
	TREATMENT_coefficients <- rep(NA, iterations)
	#a bootstrapping function for resampling values of coefficients for treatment
	for (i in 1:iterations) {
	regression_foo_bootstrap = lm(MATH_SCORE~TREATMENT, data = foo[sample(1:nrow(foo), nrow(foo), replace = T),])
	TREATMENT_coefficients[i] <- regression_foo_bootstrap$coefficients[2]
	}

	#creating a histogram showing bootstrap-sample results of coefficients for TREATMENT
	hist(TREATMENT_coefficients, border="black", col="moccasin", lwd=2,
	xlab = "Bootstrapped values of coefficients for TREATMENT",
	ylab = "Frequency")


	#calculating 95% confidence intervals of the bootstrapped values
	bootstrap_CI <- quantile(TREATMENT_coefficients, c(0.025, 0.975))
	bootstrap_CI

	#creating a table combining CI obtained from both bootstrapping and simple regression
	data.frame(Bootsrapping = bootstrap_CI, Regression=regression_CI)



	#_________________________________4_________________________________


	#bootstrap function that takes actual Ys and predicted Ys as inputs and outputs R^2
	rsquared <- function(actual_ys, predicted_ys){
	r_squared <- rep(NA, 5000)
	for (i in 1:5000) {
	resample <- sample(1:length(predicted_ys), length(predicted_ys), replace = T)
	#stores bootstrapped results in the matrix
	r_squared[i] <- 1- (sum((actual_ys[resample] - predicted_ys[resample])2)/sum((actual_ys[resample] - mean(actual_ys[resample]))2))
	}
	#calculates mean R^2
	return(mean(r_squared))
	}

	#checking if function works on afterschool data from Question #3
	NAs_in_MATH_SCORE <-which(is.na(foo$MATH_SCORE))
	no_NAs_foo <-foo[-NAs_in_MATH_SCORE, ]
	regression_foo_no_NAs <- lm(no_NAs_foo$MATH_SCORE~no_NAs_foo$TREATMENT)
	predicted_values <- predict(regression_foo_no_NAs)
	rsquared(no_NAs_foo$MATH_SCORE, regression_foo$fitted.values)


	#________________________________5_________________________________

	data_5 <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
	head(data_5)
	set.seed(12345)


	#randomly removing 2000 observations and setting it aside as a test set; creating training set
	test_set_rows <- sample(1:length(data_5$age), 2000, replace = FALSE)
	training_set <- data_5[-test_set_rows,]
	test_set <-data_5[test_set_rows,]

	print(test_set)
	print(training_set)

	#creating 5 logistic regression models
	model_1 <- glm(treat~ married +hispanic + education+ u74, data =training_set,family=binomial)
	model_2 <- glm(treat~ married, data =training_set)
	model_3 <- glm(treat~ hispanic + education+ u74+ black + nodegree, data =training_set)
	model_4 <- glm(treat~ u74 +u75,data =training_set)
	model_5 <- glm(treat~ age+ education+ black+ hispanic+ married+ nodegree+re74+re75+ u74+ u75, data =training_set)

	#running corss-validation for these models
	LOOCV1 <- cv.glm(training_set,model_1)
	LOOCV2 <- cv.glm(training_set,model_2)
	LOOCV3 <- cv.glm(training_set,model_3)
	LOOCV4 <- cv.glm(training_set,model_4)
	LOOCV5 <- cv.glm(training_set,model_5)

	?cv.glm
	LOOCV5$delta[1] #raw cross-validation estimate of prediction error.

	#calculating MSE for regression models; test set error misclassification rates make more sense for lgm but I'm very tired of this assignment, so MSE it is :(
	predicted_values_1 <-predict.glm(model_1, test_set, type = "response")
	predicted_values_2 <-predict.glm(model_2, test_set, type = "response")
	predicted_values_3 <-predict.glm(model_3, test_set, type = "response")
	predicted_values_4 <-predict.glm(model_4, test_set, type = "response")
	predicted_values_5 <-predict.glm(model_5, test_set, type = "response")

	MSE_1 <- mean((predicted_values_1- test_set$treat)^2)
	MSE_2 <- mean((predicted_values_2- test_set$treat)^2)
	MSE_3 <- mean((predicted_values_3- test_set$treat)^2)
	MSE_4 <- mean((predicted_values_4- test_set$treat)^2)
	MSE_5 <- mean((predicted_values_5- test_set$treat)^2)

	MSE_1
	MSE_2
	MSE_3
	MSE_4
	MSE_5

	#putting everything in a table
	table_5_comparison <- data.frame("LOOCV"=rbind(LOOCV1$delta[1],LOOCV2$delta[1],LOOCV3$delta[1],LOOCV4$delta[1],LOOCV5$delta[1]),
	"MSE"=rbind(MSE_1,MSE_2,MSE_3,MSE_4,MSE_5))
	table_5_comparison