Skip to content

Instantly share code, notes, and snippets.

@anglilian
Last active February 22, 2020 17:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anglilian/ed11b3cfd73a31af8fef52d5b7ed7f60 to your computer and use it in GitHub Desktop.
Save anglilian/ed11b3cfd73a31af8fef52d5b7ed7f60 to your computer and use it in GitHub Desktop.
library(ggplot2)
library(arm)
###QUESTION 1###
##PART A##
#Loading data from the CSV file
sesame <- read.csv("https://tinyurl.com/w1g163b")
#Splitting the data into two datasets with control and treatment data
sesame_treat <- subset(sesame, sesame$treatment == 1)
sesame_cont <- subset(sesame, sesame$treatment == 0)
#Creating a linear regression model for treatment and control data
lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat)
lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont)
#Plotting the data for pre test and post test along with the respective regression lines
plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" ,
pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test)))
lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p")
abline(lm_pre_treat)
abline(lm_pre_cont, lty= 'dashed')
##PART B##
sesame_edit <- read.csv("https://tinyurl.com/w1g163b")
#Change point 4 from post test to 45 from 104.6.
sesame_edit[5, 1] = 45
#Splitting the data into two datasets with control and treatment data
sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1)
sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0)
#Creating a linear regression model for treatment and control data
lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit)
lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit)
#Plotting the data for pre test and post test along with the respective regression lines
plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" ,
xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) )
lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test,
type = "p", pch = 16
) #changes the colour of the point to red
abline(lm_treat)
abline(lm_cont, lty= 'dashed')
##PART C##
#Create a linear regression model for the post test score
lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test,
data = sesame)
1#simulate the coefficients of the model
lm.4.sim <- sim(lm.4)
lm.4.sim
# creating an empty plot
plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10),
xlab = "Pre-test", ylab = "Treatment effect",
main = "Treatment Effect in Grade 4")
#adds a dotted line along the x-axis
abline (0,0, lwd = .5, lty =2)
#adds 20 lines to the plot using the simulated coefficients
for (i in 1:20){
curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x,
lwd=.5, col="gray", add= TRUE)}
#adds a line that is the difference between the two regression lines in the previous plot
curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE)
#histogram of pre test scores
hist(sesame$pre.test)
###QUESTION 2###
tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data
lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") +
I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model
#Simulating 95% interval and mean of csoa
set.seed(123) #setting randomness
iteration <- 1000 #number of simulations to run
sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model
agevector <- c(20,30,40,50,60,70,80)
storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
for (age in agevector){ #iterating through each age in age vector
for (i in 1:1000) { #1000 times for each age
treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated
storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
}
}
quantile(storage.matrix[,7], c(0.025,0.975))
#Creating a table of results
means <- apply(storage.matrix, 2, mean) #computing mean of each age
conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
table_ages <- rbind(means, conf.intervals) #putting data together in one table
colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
table_ages
#Plotting results onto a graph
plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)",
ylab = "CSOA", main= "Expected Values of Treated Females")
for (age in agevector) {
segments( #plots the confidence intervals of each age
x0 = age,
y0 = conf.intervals[1, age/10-1],
x1 = age,
y1 = conf.intervals[2, age/10-1],
lwd = 2)
}
lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age
##PART B##
storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
#Simulating csoa values for control group
for (age in agevector){ #iterating through each age in age vector
for (i in 1:1000) { #1000 times for each age
untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated
storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
}
}
#calculating the treatment effect for each entry by subtracting the treated from the controlled
storage.treat <- storage.matrix - storage.matrix.cont
#Creating a table of results
means.treat <-apply(storage.treat, 2, mean) #computing mean of each age
conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table
colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
table_ages.treat
#Plotting results onto a graph
plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)",
ylab = "CSOA", main= "Treatment Effect of Females")
for (age in agevector) {
segments(
x0 = age,
y0 = conf.intervals.treat[1, age/10-1],
x1 = age,
y1 = conf.intervals.treat[2, age/10-1],
lwd = 2)
}
lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means
###QUESTION 3###
#Function for R squared that takes the two arguments: y and the predicted y
rsquare <- function(y, yhat) {
sumsquaredregression <- 0
totalsumofsquares <- 0
for (i in 1:length(y)) {
sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression
totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares
}
print (paste("The R-squared value is",
1-(sumsquaredregression/totalsumofsquares))) #equation for R squared
}
#loading data for lalonde
library(Matching)
data(lalonde)
#creating a linear model for lalonde to predict for re78
lm.lalonde <- lm(re78 ~., data= lalonde)
yhat <- predict(lm.lalonde)
#Output r-squared values
rsquare(lalonde$re78, yhat)
summary(lm.lalonde)$r.squared
##Question 4###
#loading data
library(foreign)
maze <- read.dta("mazedata1.dta")
#setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise
treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0)
maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe
#holds all the round1 values for treatment group
treated <- maze$round1[which(maze$treat==1)]
#holds all the round1 values for control group
control <- maze$round1[which(maze$treat==0)]
#create storage list
store <- c()
#bootstrap the treatment effect for 10,000 samples
for (x in 1:10000) {
store[x] <- mean(sample(treated, length(treated), replace = TRUE)) -
mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment
}
#store lower and upper boundary of confidence interval of the bootstrapped treatment effects
confidenceinterval <- quantile(store, c(0.025,0.975))
#linear model with round1 as dependent variable and treatment as independent variable
lm.maze = lm(round1 ~ treat, data = maze)
#computing confidence interval using function based on linear model
confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table
confint
#create a probability density histogram of treatment effect
hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE)
###QUESTION 5###
#loading data
foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
#setting randomness
set.seed(12345)
#removing 2000 rows randomly from the original data set
test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes
test_foo <- foo[-test_set_rows,] #removing the rows from the data set
#Building the model for surname A-F
glm.simple <- glm(treat~ age, data = test_foo)
glm.complex <- glm(treat~. -re78, data = test_foo)
library(boot)
##LOOCV
#using the cross validation function that by default sets the number of folds to 1 for LOOCV
LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE
LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE
##10-fold cross validation
#using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs
cv.error.simple.10 = rep(0,10) #stores each CV result
for(i in 1:10) { #loops 10 times to attach each result
cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV
}
mean(cv.error.simple.10) #computes mean for simple model
cv.error.complex.10 = rep(0,10) #stores each CV result
for(i in 1:10) { #loops 10 times to attach each result
cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV
}
mean(cv.error.complex.10) #computes mean for complex model
##test set error
#subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average
mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model
mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model
###QUESTION 6
trt = matrix(NA,nrow=2,ncol=7)
ctrl = matrix(NA,nrow=2,ncol=7)
trt[,1]=c(0, 2) #18
ctrl[,1]=c(3, 10)
trt[,2]=c(0, 3) #20
ctrl[,2]=c(2, 8)
trt[,3]=c(0, 4) #22
ctrl[,3]=c(2, 7)
trt[,4]=c(1, 3) #24
ctrl[,4]=c(2, 6)
trt[,5]=c(1, 3) #26
ctrl[,5]=c(2, 5)
trt[,6]=c(1, 3) #28
ctrl[,6]=c(2, 4)
trt[,7]=c(1, 2) #30
ctrl[,7]=c(1, 3)
c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt
c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2,
main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week")
for (age in seq(from=18,to=30,by=2)) {
segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1],
x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1)
segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1],
x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2)
}
legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2))
mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment