anglilian/Q1_CS112_Assignment2.r

## Q1_CS112_Assignment2.r
  library(ggplot2)
library(arm)
###QUESTION 1###
##PART A##
#Loading data from the CSV file
sesame <- read.csv("https://tinyurl.com/w1g163b")

#Splitting the data into two datasets with control and treatment data
sesame_treat <- subset(sesame, sesame$treatment == 1)
sesame_cont <- subset(sesame, sesame$treatment == 0)

#Creating a linear regression model for treatment and control data
lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat)
lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont)

#Plotting the data for pre test and post test along with the respective regression lines
plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" ,
     pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test)))
lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p")
abline(lm_pre_treat)
abline(lm_pre_cont, lty= 'dashed')

##PART B##
sesame_edit <- read.csv("https://tinyurl.com/w1g163b")

#Change point 4 from post test to 45 from 104.6.
sesame_edit[5, 1] = 45

#Splitting the data into two datasets with control and treatment data
sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1)
sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0)

#Creating a linear regression model for treatment and control data
lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit)
lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit)

#Plotting the data for pre test and post test along with the respective regression lines
plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" ,
    xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) )
lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test,
       type = "p", pch = 16
       ) #changes the colour of the point to red
abline(lm_treat)
abline(lm_cont, lty= 'dashed')

##PART C##
#Create a linear regression model for the post test score
lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test,
           data = sesame)

1#simulate the coefficients of the model
lm.4.sim <- sim(lm.4)
lm.4.sim

# creating an empty plot
plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10),
     xlab = "Pre-test", ylab = "Treatment effect",
     main = "Treatment Effect in Grade 4")

#adds a dotted line along the x-axis
abline (0,0, lwd = .5, lty =2)

#adds 20 lines to the plot using the simulated coefficients
for (i in 1:20){
  curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x,
        lwd=.5, col="gray", add= TRUE)}

#adds a line that is the difference between the two regression lines in the previous plot
curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE)

#histogram of pre test scores
hist(sesame$pre.test)

## Q2_CS112_Assignment2.r
###QUESTION 2###
tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data
lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") +
                I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model

#Simulating 95% interval and mean of csoa
set.seed(123) #setting randomness
iteration <- 1000 #number of simulations to run
sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model

agevector <- c(20,30,40,50,60,70,80)
storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
for (age in agevector){ #iterating through each age in age vector
  for (i in 1:1000) { #1000 times for each age
    treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated
    storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
  }
}


quantile(storage.matrix[,7], c(0.025,0.975))

#Creating a table of results
means <- apply(storage.matrix, 2, mean) #computing mean of each age
conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
table_ages <- rbind(means, conf.intervals) #putting data together in one table
colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
table_ages

#Plotting results onto a graph
plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)",
     ylab = "CSOA", main= "Expected Values of Treated Females")
for (age in agevector) {
  segments( #plots the confidence intervals of each age
    x0 = age,
    y0 = conf.intervals[1, age/10-1],
    x1 = age,
    y1 = conf.intervals[2, age/10-1],
    lwd = 2)
}
lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age


##PART B##
storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age

#Simulating csoa values for control group
for (age in agevector){ #iterating through each age in age vector
  for (i in 1:1000) { #1000 times for each age
    untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated
    storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
  }
}

#calculating the treatment effect for each entry by subtracting the treated from the controlled
storage.treat <- storage.matrix - storage.matrix.cont

#Creating a table of results
means.treat <-apply(storage.treat, 2, mean) #computing mean of each age
conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table
colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
table_ages.treat

#Plotting results onto a graph
plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)",
     ylab = "CSOA", main= "Treatment Effect of Females")
for (age in agevector) {
  segments(
    x0 = age,
    y0 = conf.intervals.treat[1, age/10-1],
    x1 = age,
    y1 = conf.intervals.treat[2, age/10-1],
    lwd = 2)
}
lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means

## Q3_CS112_Assignment2.r
###QUESTION 3###

#Function for R squared that takes the two arguments: y and the predicted y
rsquare <- function(y, yhat) {
  sumsquaredregression <- 0
  totalsumofsquares <- 0
  for (i in 1:length(y)) {
  sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression
  totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares
  }
  print (paste("The R-squared value is",
        1-(sumsquaredregression/totalsumofsquares))) #equation for R squared
}

#loading data for lalonde
library(Matching)
data(lalonde)

#creating a linear model for lalonde to predict for re78
lm.lalonde <- lm(re78 ~., data= lalonde)
yhat <- predict(lm.lalonde)

#Output r-squared values
rsquare(lalonde$re78, yhat)
summary(lm.lalonde)$r.squared

## Q4_CS112_Assignment2.r
##Question 4###
#loading data
library(foreign)
maze <- read.dta("mazedata1.dta")

#setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise
treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0)
maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe

#holds all the round1 values for treatment group
treated <- maze$round1[which(maze$treat==1)]

#holds all the round1 values for control group
control <- maze$round1[which(maze$treat==0)]

#create storage list
store <- c()

#bootstrap the treatment effect for 10,000 samples
for (x in 1:10000) {
  store[x] <- mean(sample(treated, length(treated), replace = TRUE)) -
              mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment
}

#store lower and upper boundary of confidence interval of the bootstrapped treatment effects
confidenceinterval <- quantile(store, c(0.025,0.975))

#linear model with round1 as dependent variable and treatment as independent variable
lm.maze = lm(round1 ~ treat, data = maze)

#computing confidence interval using function based on linear model
confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table
confint

#create a probability density histogram of treatment effect
hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE)

## Q5_CS112_Assignment2.r
###QUESTION 5###
#loading data
foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))

#setting randomness
set.seed(12345)

#removing 2000 rows randomly from the original data set
test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes
test_foo <- foo[-test_set_rows,] #removing the rows from the data set

#Building the model for surname A-F
glm.simple <- glm(treat~ age, data = test_foo)
glm.complex <- glm(treat~. -re78, data = test_foo)

library(boot)

##LOOCV
#using the cross validation function that by default sets the number of folds to 1 for LOOCV
LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE

LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE

##10-fold cross validation
#using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs
cv.error.simple.10 = rep(0,10) #stores each CV result
for(i in 1:10) { #loops 10 times to attach each result
  cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV
}
mean(cv.error.simple.10) #computes mean for simple model

cv.error.complex.10 = rep(0,10) #stores each CV result
for(i in 1:10) { #loops 10 times to attach each result
  cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV
}
mean(cv.error.complex.10) #computes mean for complex model

##test set error
#subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average

mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model

mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model

## Q6_CS112_Assignment2.r
###QUESTION 6
trt = matrix(NA,nrow=2,ncol=7)
ctrl = matrix(NA,nrow=2,ncol=7)

trt[,1]=c(0, 2) #18
ctrl[,1]=c(3, 10)
trt[,2]=c(0, 3) #20
ctrl[,2]=c(2, 8)
trt[,3]=c(0, 4) #22
ctrl[,3]=c(2, 7)
trt[,4]=c(1, 3) #24
ctrl[,4]=c(2, 6)
trt[,5]=c(1, 3) #26
ctrl[,5]=c(2, 5)
trt[,6]=c(1, 3) #28
ctrl[,6]=c(2, 4)
trt[,7]=c(1, 2) #30
ctrl[,7]=c(1, 3)


c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt
c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl

plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2,
     main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week")

for (age in seq(from=18,to=30,by=2)) {
  segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1],
           x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1)

  segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1],
           x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2)
}
legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2))
mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10)
	library(ggplot2)
	library(arm)
	###QUESTION 1###
	##PART A##
	#Loading data from the CSV file
	sesame <- read.csv("https://tinyurl.com/w1g163b")

	#Splitting the data into two datasets with control and treatment data
	sesame_treat <- subset(sesame, sesame$treatment == 1)
	sesame_cont <- subset(sesame, sesame$treatment == 0)

	#Creating a linear regression model for treatment and control data
	lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat)
	lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont)

	#Plotting the data for pre test and post test along with the respective regression lines
	plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" ,
	pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test)))
	lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p")
	abline(lm_pre_treat)
	abline(lm_pre_cont, lty= 'dashed')

	##PART B##
	sesame_edit <- read.csv("https://tinyurl.com/w1g163b")

	#Change point 4 from post test to 45 from 104.6.
	sesame_edit[5, 1] = 45

	#Splitting the data into two datasets with control and treatment data
	sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1)
	sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0)

	#Creating a linear regression model for treatment and control data
	lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit)
	lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit)

	#Plotting the data for pre test and post test along with the respective regression lines
	plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" ,
	xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) )
	lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test,
	type = "p", pch = 16
	) #changes the colour of the point to red
	abline(lm_treat)
	abline(lm_cont, lty= 'dashed')

	##PART C##
	#Create a linear regression model for the post test score
	lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test,
	data = sesame)

	1#simulate the coefficients of the model
	lm.4.sim <- sim(lm.4)
	lm.4.sim

	# creating an empty plot
	plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10),
	xlab = "Pre-test", ylab = "Treatment effect",
	main = "Treatment Effect in Grade 4")

	#adds a dotted line along the x-axis
	abline (0,0, lwd = .5, lty =2)

	#adds 20 lines to the plot using the simulated coefficients
	for (i in 1:20){
	curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x,
	lwd=.5, col="gray", add= TRUE)}

	#adds a line that is the difference between the two regression lines in the previous plot
	curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE)

	#histogram of pre test scores
	hist(sesame$pre.test)
	###QUESTION 2###
	tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data
	lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") +
	I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model

	#Simulating 95% interval and mean of csoa
	set.seed(123) #setting randomness
	iteration <- 1000 #number of simulations to run
	sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model

	agevector <- c(20,30,40,50,60,70,80)
	storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
	for (age in agevector){ #iterating through each age in age vector
	for (i in 1:1000) { #1000 times for each age
	treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated
	storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
	}
	}



	quantile(storage.matrix[,7], c(0.025,0.975))

	#Creating a table of results
	means <- apply(storage.matrix, 2, mean) #computing mean of each age
	conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
	table_ages <- rbind(means, conf.intervals) #putting data together in one table
	colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
	table_ages

	#Plotting results onto a graph
	plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)",
	ylab = "CSOA", main= "Expected Values of Treated Females")
	for (age in agevector) {
	segments( #plots the confidence intervals of each age
	x0 = age,
	y0 = conf.intervals[1, age/10-1],
	x1 = age,
	y1 = conf.intervals[2, age/10-1],
	lwd = 2)
	}
	lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age


	##PART B##
	storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age

	#Simulating csoa values for control group
	for (age in agevector){ #iterating through each age in age vector
	for (i in 1:1000) { #1000 times for each age
	untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated
	storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
	}
	}

	#calculating the treatment effect for each entry by subtracting the treated from the controlled
	storage.treat <- storage.matrix - storage.matrix.cont

	#Creating a table of results
	means.treat <-apply(storage.treat, 2, mean) #computing mean of each age
	conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
	table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table
	colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
	table_ages.treat

	#Plotting results onto a graph
	plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)",
	ylab = "CSOA", main= "Treatment Effect of Females")
	for (age in agevector) {
	segments(
	x0 = age,
	y0 = conf.intervals.treat[1, age/10-1],
	x1 = age,
	y1 = conf.intervals.treat[2, age/10-1],
	lwd = 2)
	}
	lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means
	###QUESTION 3###

	#Function for R squared that takes the two arguments: y and the predicted y
	rsquare <- function(y, yhat) {
	sumsquaredregression <- 0
	totalsumofsquares <- 0
	for (i in 1:length(y)) {
	sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression
	totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares
	}
	print (paste("The R-squared value is",
	1-(sumsquaredregression/totalsumofsquares))) #equation for R squared
	}

	#loading data for lalonde
	library(Matching)
	data(lalonde)

	#creating a linear model for lalonde to predict for re78
	lm.lalonde <- lm(re78 ~., data= lalonde)
	yhat <- predict(lm.lalonde)

	#Output r-squared values
	rsquare(lalonde$re78, yhat)
	summary(lm.lalonde)$r.squared
	##Question 4###
	#loading data
	library(foreign)
	maze <- read.dta("mazedata1.dta")

	#setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise
	treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0)
	maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe

	#holds all the round1 values for treatment group
	treated <- maze$round1[which(maze$treat==1)]

	#holds all the round1 values for control group
	control <- maze$round1[which(maze$treat==0)]

	#create storage list
	store <- c()

	#bootstrap the treatment effect for 10,000 samples
	for (x in 1:10000) {
	store[x] <- mean(sample(treated, length(treated), replace = TRUE)) -
	mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment
	}

	#store lower and upper boundary of confidence interval of the bootstrapped treatment effects
	confidenceinterval <- quantile(store, c(0.025,0.975))

	#linear model with round1 as dependent variable and treatment as independent variable
	lm.maze = lm(round1 ~ treat, data = maze)

	#computing confidence interval using function based on linear model
	confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table
	confint

	#create a probability density histogram of treatment effect
	hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE)
	###QUESTION 5###
	#loading data
	foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))

	#setting randomness
	set.seed(12345)

	#removing 2000 rows randomly from the original data set
	test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes
	test_foo <- foo[-test_set_rows,] #removing the rows from the data set

	#Building the model for surname A-F
	glm.simple <- glm(treat~ age, data = test_foo)
	glm.complex <- glm(treat~. -re78, data = test_foo)

	library(boot)

	##LOOCV
	#using the cross validation function that by default sets the number of folds to 1 for LOOCV
	LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE

	LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE

	##10-fold cross validation
	#using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs
	cv.error.simple.10 = rep(0,10) #stores each CV result
	for(i in 1:10) { #loops 10 times to attach each result
	cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV
	}
	mean(cv.error.simple.10) #computes mean for simple model

	cv.error.complex.10 = rep(0,10) #stores each CV result
	for(i in 1:10) { #loops 10 times to attach each result
	cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV
	}
	mean(cv.error.complex.10) #computes mean for complex model

	##test set error
	#subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average

	mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model

	mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model
	###QUESTION 6
	trt = matrix(NA,nrow=2,ncol=7)
	ctrl = matrix(NA,nrow=2,ncol=7)

	trt[,1]=c(0, 2) #18
	ctrl[,1]=c(3, 10)
	trt[,2]=c(0, 3) #20
	ctrl[,2]=c(2, 8)
	trt[,3]=c(0, 4) #22
	ctrl[,3]=c(2, 7)
	trt[,4]=c(1, 3) #24
	ctrl[,4]=c(2, 6)
	trt[,5]=c(1, 3) #26
	ctrl[,5]=c(2, 5)
	trt[,6]=c(1, 3) #28
	ctrl[,6]=c(2, 4)
	trt[,7]=c(1, 2) #30
	ctrl[,7]=c(1, 3)


	c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt
	c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl

	plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2,
	main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week")

	for (age in seq(from=18,to=30,by=2)) {
	segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1],
	x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1)

	segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1],
	x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2)
	}
	legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2))
	mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10)