Aaronmoralesshildrick/CS112Assignment2

## CS112Assignment2
#Question 1
set.seed(123)
#Creation of data
X <- c(runif(99, min = 0, max = 12))
Y = (study_time^2 * 5) + rnorm(99, mean = 0, sd = 50)
mydata = data.frame(X, Y)

#Creation of the models and plots (also calls for summary function are here)
plot(Y~X, main = "How Outliers Affect Regression")
lm1 = lm(Y~X, data = mydata)
abline(lm1, col = "blue")
abline(lm2, col = "red")
summary(lm1)
#Plot and data with the extra observation
outlierdata = rbind(mydata, c(0, 20000))
plot(Y~X, data = outlierdata, main = "Full Data with Outlier Position")

lm2 = lm(Y~X, data = outlierdata)
abline(lm2)
summary(lm2)


#Question 2
excluded <- which(lalonde$treat == 1)
lalonde.control <- lalonde[-excluded, ] #Only control group patients

lm_control = lm(re78 ~ age + educ + re74 +re75 +
                educ*re74 + educ*re75 + age*re74 + age*re75 + re74*re75,
                data = lalonde.control) #Linear model with the specified predictors
lm_control$coef
#intercept = 3685.51, age = 2.21, educ = 39.06, re74 = -0.015, re75 = 0.784,
#educ:re74 = 0.0344, educ:re75 = -0.072, age:re74 = -0.0057, age:re75 = 0.0093,
#re74:re75 = -0.000022

set.seed(123)
sim_results <- sim(lm_control, n.sims = 10000)
mean(sim_results@coef[,10]) #try this with all numbers for predictors gives following data
#            From sim         From lm_control
#Intercept   3645.201           3685.51
#age         2.234              2.21
#educ        43.319             39.06
#re74        -0.023             -0.015
#re75        0.832               0.784
#educ:re74    0.033             0.0344
#educ:re75    -0.074            -0.072
#age:re74     -0.0051           -0.0057
#age:re75     0.008             0.0093
#re74:re75   -0.0000229          -0.000022

sigmas <- c(sim_results@sigma)
coefs <- c(sim_results@coef)

#find the medians
med.edu = median(lalonde.control$educ)
med.re74 = median(lalonde.control$re74)
med.re75 = median(lalonde.control$re75)
ages = c(min(lalonde.control$age): max(lalonde.control$age))

set.seed(1)
stored = data.frame() #Create an empty data frame

#Nested for loop that keeps valued at their medians and only age varies.
#Includes sigmas
for(j in ages){
  unit = c(1, j, med.edu, 0, 0, 0, 0, 0, 0, 0)
  for(i in 1:10000){
    stored[i, j - 16] <-
      sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
  }
}
stored
confints = apply(stored, 2, quantile, probs = c(0.025, 0.975)) #find confidence interval per age
confints2
#plot the graph
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-20000,30000),
     main = "Confidence intervals per age with variables at their medians", xlab = "Age",
     ylab = "Real earnings in 1978")

for (age in ages) {
  segments(
    x0 = age,
    y0 = confints[1, age - 16],
    x1 = age,
    y1 = confints[2, age - 16],
    lwd = 2)
}

#90 Quantile problem
#set values at their 90th quantile
edu.90 = quantile(lalonde.control$educ, probs = c(0.9))
re74.90 = quantile(lalonde.control$re74, probs = c(0.9))
re75.90 = quantile(lalonde.control$re75, probs = c(0.9))

set.seed(1)
stored2 = data.frame()
#repeat process with new values and data frame.
for(j in ages){
  unit = c(1, j, edu.90, re74.90, re75.90, j*re74.90, j*re75.90, edu.90*re74.90, edu.90*re75.90, re74.90*re75.90)
  for(i in 1:10000){
    stored2[i, j - 16] <-
      sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
  }
}
head(stored2)
confints2 = apply(stored2, 2, quantile, probs = c(0.025, 0.975))

plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-70000,70000),
     main = "Confidence intervals per age with variables at their 90th quantile", xlab = "Age",
     ylab = "Real earnings in 1978")

for (age in ages) {
  segments(
    x0 = age,
    y0 = confints2[1, age - 16],
    x1 = age,
    y1 = confints2[2, age - 16],
    lwd = 2)
}


#Question 3
library("foreign")
nsw.dta <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
nsw.dta
set.seed(123)
lmnsw = lm(re78~treat, data = nsw.dta) #linear model
storage <- c(rep (0, 10000)) #empty vector

#bootstrapp and create a model for each repetition. Store the coeficients in the vector
for(i in 1:10000){
  index <- sample(1:nrow(nsw.dta), nrow(nsw.dta), replace = TRUE)
  boot.sample = nsw.dta[index, ]
  bootmodel = lm(re78~treat, data = boot.sample)
  storage[i] = bootmodel$coefficients[2]
}
quant = quantile(storage, probs = c(0.025, 0.975))
quant
confint(lmnsw) #compare with the function created above

hist(storage, xlab = "Value of coefficients", main = "Histogram for bootstrapped coefficients",
     col = "steelblue4")

#Question 4
r.sqr <- function(y, y1){
  SE <- sum((y1 - y) ^ 2)  ## Squared error
  ME <- sum((y - mean(y)) ^ 2)  ## error from mean
  rsq <- 1 - SE/ME
  return(rsq)
}

testmodel <- lm(re75~education * age + age + education, data = nsw.dta)
preds <- predict(testmodel)
r.sqr(nsw.dta$re75, preds)

#Question 5
nsw <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
nsw <- nsw[, -1]
model1 <- lm(treat~.-re78, data = nsw, family = "binomial")
summary(model1)
pred <- predict(model1, type = "response")
proba <- data.frame(nsw$treat, pred)
head(proba)
treat <- subset(proba, proba$nsw.treat == 1)
control <- subset(proba, proba$nsw.treat == 0)
head(treat)
head(control)

hist(treat$pred, ylim = c(0,125), xlim = c(0.3, 0.6), col = "firebrick", xlab = "Probabilities",
     main = "Histogram for probabilities in the treatment group")
hist(control$pred, ylim = c(0,200), xlim = c(0.3, 0.55), col = "dodgerblue4",
     main = "Histogram for probabilities in the control group")
	#Question 1
	set.seed(123)
	#Creation of data
	X <- c(runif(99, min = 0, max = 12))
	Y = (study_time^2 * 5) + rnorm(99, mean = 0, sd = 50)
	mydata = data.frame(X, Y)

	#Creation of the models and plots (also calls for summary function are here)
	plot(Y~X, main = "How Outliers Affect Regression")
	lm1 = lm(Y~X, data = mydata)
	abline(lm1, col = "blue")
	abline(lm2, col = "red")
	summary(lm1)
	#Plot and data with the extra observation
	outlierdata = rbind(mydata, c(0, 20000))
	plot(Y~X, data = outlierdata, main = "Full Data with Outlier Position")

	lm2 = lm(Y~X, data = outlierdata)
	abline(lm2)
	summary(lm2)


	#Question 2
	excluded <- which(lalonde$treat == 1)
	lalonde.control <- lalonde[-excluded, ] #Only control group patients

	lm_control = lm(re78 ~ age + educ + re74 +re75 +
	educre74 + educre75 + agere74 + agere75 + re74*re75,
	data = lalonde.control) #Linear model with the specified predictors
	lm_control$coef
	#intercept = 3685.51, age = 2.21, educ = 39.06, re74 = -0.015, re75 = 0.784,
	#educ:re74 = 0.0344, educ:re75 = -0.072, age:re74 = -0.0057, age:re75 = 0.0093,
	#re74:re75 = -0.000022

	set.seed(123)
	sim_results <- sim(lm_control, n.sims = 10000)
	mean(sim_results@coef[,10]) #try this with all numbers for predictors gives following data
	# From sim From lm_control
	#Intercept 3645.201 3685.51
	#age 2.234 2.21
	#educ 43.319 39.06
	#re74 -0.023 -0.015
	#re75 0.832 0.784
	#educ:re74 0.033 0.0344
	#educ:re75 -0.074 -0.072
	#age:re74 -0.0051 -0.0057
	#age:re75 0.008 0.0093
	#re74:re75 -0.0000229 -0.000022

	sigmas <- c(sim_results@sigma)
	coefs <- c(sim_results@coef)

	#find the medians
	med.edu = median(lalonde.control$educ)
	med.re74 = median(lalonde.control$re74)
	med.re75 = median(lalonde.control$re75)
	ages = c(min(lalonde.control$age): max(lalonde.control$age))

	set.seed(1)
	stored = data.frame() #Create an empty data frame

	#Nested for loop that keeps valued at their medians and only age varies.
	#Includes sigmas
	for(j in ages){
	unit = c(1, j, med.edu, 0, 0, 0, 0, 0, 0, 0)
	for(i in 1:10000){
	stored[i, j - 16] <-
	sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
	}
	}
	stored
	confints = apply(stored, 2, quantile, probs = c(0.025, 0.975)) #find confidence interval per age
	confints2
	#plot the graph
	plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-20000,30000),
	main = "Confidence intervals per age with variables at their medians", xlab = "Age",
	ylab = "Real earnings in 1978")

	for (age in ages) {
	segments(
	x0 = age,
	y0 = confints[1, age - 16],
	x1 = age,
	y1 = confints[2, age - 16],
	lwd = 2)
	}

	#90 Quantile problem
	#set values at their 90th quantile
	edu.90 = quantile(lalonde.control$educ, probs = c(0.9))
	re74.90 = quantile(lalonde.control$re74, probs = c(0.9))
	re75.90 = quantile(lalonde.control$re75, probs = c(0.9))

	set.seed(1)
	stored2 = data.frame()
	#repeat process with new values and data frame.
	for(j in ages){
	unit = c(1, j, edu.90, re74.90, re75.90, jre74.90, jre75.90, edu.90re74.90, edu.90re75.90, re74.90*re75.90)
	for(i in 1:10000){
	stored2[i, j - 16] <-
	sum(unit * sim_results@coef[i,] )+ rnorm(1, 0, sim_results@sigma[i])
	}
	}
	head(stored2)
	confints2 = apply(stored2, 2, quantile, probs = c(0.025, 0.975))

	plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-70000,70000),
	main = "Confidence intervals per age with variables at their 90th quantile", xlab = "Age",
	ylab = "Real earnings in 1978")

	for (age in ages) {
	segments(
	x0 = age,
	y0 = confints2[1, age - 16],
	x1 = age,
	y1 = confints2[2, age - 16],
	lwd = 2)
	}


	#Question 3
	library("foreign")
	nsw.dta <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
	nsw.dta
	set.seed(123)
	lmnsw = lm(re78~treat, data = nsw.dta) #linear model
	storage <- c(rep (0, 10000)) #empty vector

	#bootstrapp and create a model for each repetition. Store the coeficients in the vector
	for(i in 1:10000){
	index <- sample(1:nrow(nsw.dta), nrow(nsw.dta), replace = TRUE)
	boot.sample = nsw.dta[index, ]
	bootmodel = lm(re78~treat, data = boot.sample)
	storage[i] = bootmodel$coefficients[2]
	}
	quant = quantile(storage, probs = c(0.025, 0.975))
	quant
	confint(lmnsw) #compare with the function created above

	hist(storage, xlab = "Value of coefficients", main = "Histogram for bootstrapped coefficients",
	col = "steelblue4")

	#Question 4
	r.sqr <- function(y, y1){
	SE <- sum((y1 - y) ^ 2) ## Squared error
	ME <- sum((y - mean(y)) ^ 2) ## error from mean
	rsq <- 1 - SE/ME
	return(rsq)
	}

	testmodel <- lm(re75~education * age + age + education, data = nsw.dta)
	preds <- predict(testmodel)
	r.sqr(nsw.dta$re75, preds)

	#Question 5
	nsw <- read.dta("/Users/aaron/Documents/Minerva/CS\ 112/Assignments/Assignment\ 2/nsw.dta")
	nsw <- nsw[, -1]
	model1 <- lm(treat~.-re78, data = nsw, family = "binomial")
	summary(model1)
	pred <- predict(model1, type = "response")
	proba <- data.frame(nsw$treat, pred)
	head(proba)
	treat <- subset(proba, proba$nsw.treat == 1)
	control <- subset(proba, proba$nsw.treat == 0)
	head(treat)
	head(control)

	hist(treat$pred, ylim = c(0,125), xlim = c(0.3, 0.6), col = "firebrick", xlab = "Probabilities",
	main = "Histogram for probabilities in the treatment group")
	hist(control$pred, ylim = c(0,200), xlim = c(0.3, 0.55), col = "dodgerblue4",
	main = "Histogram for probabilities in the control group")