viniciusmss/gist:1ebd3857fcdf964600e3504a3384eb6b

## gistfile1.txt

##### DO NOT "SELECT ALL" AND EXECUTE THIS CODE ALL AT ONCE!!!
##### EXECUTE LINE-BY-LINE AND THINK ABOUT WHAT EACH LINE IS DOING.
##### RAISE YOUR HAND IF YOU HAVE QUESTIONS.

##### YOU SHOULD BE ABLE TO USE JUPYTER FOR THIS, KERNEL = R (R-PROJECT)

# If you have not done so already, install the "arm" package and load the library

install.packages("arm")

library(arm)

# Load data given in pre-class work: "http://tinyurl.com/z5ygmch"

# RUN THE REGRESSION
sesame <- read.csv(url("https://docs.google.com/spreadsheets/d/e/2PACX-1vQC5enEj91bsrAmXER5z0eC_xlUbe_rhYiXEeTlb90-w1pxbqfmejfRwaCvRZzm201_nSlS-ZG0MN70/pub?gid=1873712132&single=true&output=csv"))

# Note: The ".4" below, (“lm.4”), indicates results for students in grade 4
lm.4 <- lm (post.test ~ treatment + pre.test + I(treatment*pre.test), data = sesame)
lm.4.equivalent <- lm(post.test ~ treatment*pre.test, data = sesame)
lm.4.equivalent2 <- lm(post.test ~ treatment + pre.test + treatment:pre.test, data = sesame)
display (lm.4)
display (lm.4.equivalent) # Just to demonstrate how this syntax works
display (lm.4.equivalent2)# Just to demonstrate how this syntax works

# APPLY THE "SIM" FUNCTION TO SIMULATE THE UNCERTAINTY
lm.4.sim <- sim (lm.4, n.sims = 1000)

(lm.4.sim)@coef # explore the simulated coefficients all equally likely under this model
lm.4.sim@sigma # explore the simulated estimates of fundamental uncertainty
# these are simulations of standard deviations of the error term of the regression
# which (you may recall) is assumed to be distributed via a Normal distribution

# QUESTION: Which coefficients are the key for estimating treatment effects?
# the coefficients for the treatment indicator and interaction term?
# Confirm that they are the SECOND and FOURTH columns, following the logic below...

# According to this model...
# treatment effect = Y(treatment = 1) - Y(treatment = 0)

# treatment effect = (coef(treatment)*treatment + coef(pre.test)*pre.test +
#                     coef(interaction)*treatment*pre.test) when treatment == 1
#                    MINUS
#                    (coef(treatment)*treatment + coef(pre.test)*pre.test +
#                     coef(interaction)*treatment*pre.test) when treatment == 0

# treatment effect = coef(treatment)*1 + coef(pre.test)*pre.test + coef(interaction)*1*pre.test)
#                    MINUS
#                    (coef(treatment)*0 + coef(pre.test)*pre.test + coef(interaction)*0*pre.test)

# treatment effect = coef(treatment) + coef(interaction)*pre.test)


# treatment effect = coef(interaction.term)*pre.test + coef(treatment)

# NOTICE!!!   This is what we worked on together as a class, earlier.
#             This equation can be rewritten as:
#             y = m*x + b  IF y = treatment effect and x = pre.test

# In other words, if you make a figure that plots pre-test on the x axis, and
# treatment effect on the y-axis, coef(treatment) is "b", the intercept, and
# coef(interaction.term) is "m", the slope.

# Let's make a figure like that, reproducing the plot we saw earlier.

# REPRODUCE THE first line in the PLOT we saw earlier
# FIRST JUST THE AXES (no data plotted)
plot (0, 0, xlim=c(80, 120), ylim=c(-5,10),
      xlab="pre-test", ylab="treatment effect",
      main="treatment effect in grade 4")

abline (h = 0, lwd=.5, lty=2) # draws a horizontal line

# "abline" draws a line with intercept "a" and slope "b".
# ###TASK -- DETERMINE WHY WE TAKE coef()[1,2] and coef()[1,4] below!!!
abline (a = coef(lm.4.sim)[1,2], b = coef(lm.4.sim)[1,4],
        lwd = .5, col = "gray")

## Additional exercises...

## EXERCISE 1 (Fill in the "???")

# The line you just drew is associated with the first row in the table of
# simulated coefficients. (This is a partial answer re the task directly above.)
# You could write a "for loop" to draw one line for each
# of the other rows..

for (i in 1:1000) {
  abline (a = coef(lm.4.sim)[i, 2], b = coef(lm.4.sim)[i, 4],
          lwd = .2, col = "gray")
}

## Recover the average treatment effect previously obtained by regression
## by simulating & averaging over the simulated coefficients.  Draw a thicker
## line showing the average treatment effect line.
## Thicken the line with lwd = 3 added as an argument to abline.
## You can also color the line: e.g., color = "purple4"

abline (a = mean(coef(lm.4.sim)[,2]),
        b = mean(coef(lm.4.sim)[,4]),
        lwd = 3, col = "purple4")


## EXERCISE 2
## Estimate standard error of 'treatment' coefficient by calculating the
## standard deviation of the simulated "treatment" coefficients.
sd(coef(lm.4.sim)[,2])
sd(coef(lm.4.sim)[,4])

## EXERCISE 3
## Use confint(lm.4.sim) to get confidence intervals, and then confirm the
## confidence intervals by simulation (using “quantile” function)...
confint(lm.4)
quantile(coef(lm.4.sim)[,2], c(0.025, 0.975))
quantile(coef(lm.4.sim)[,4], c(0.025, 0.975))

## EXERCISE 4
## Estimate 95% intervals for the expected value of the dependent variable using the simulated results.
pre.test.mean <- mean(sesame$pre.test)
treatment.mean <- mean(sesame$treatment)
hyp.unit <- c(1, treatment.mean, pre.test.mean, pre.test.mean*treatment.mean)
pred <- sum(hyp.unit * coef(lm.4.sim)[1,])

preds <- c()
for (i in 1:1000) {
  preds[i] <- sum(hyp.unit * coef(lm.4.sim)[i,])
}
preds
quantile(preds, c(0.025, 0.975))

## EXERCISE 5
## Estimate 95% intervals for the dependent variable using the simulated results.
preds <- c()
for (i in 1:1000) {
  preds[i] <- sum(hyp.unit * coef(lm.4.sim)[i,]) + rnorm(1, 0, lm.4.sim@sigma[i])
}
preds
quantile(preds, c(0.025, 0.975))

# y = coefficients * cov. values + normal(0, std)

## EXERCISE 6
## Calculate RSS and R-squared
display(lm.4)

	##### DO NOT "SELECT ALL" AND EXECUTE THIS CODE ALL AT ONCE!!!
	##### EXECUTE LINE-BY-LINE AND THINK ABOUT WHAT EACH LINE IS DOING.
	##### RAISE YOUR HAND IF YOU HAVE QUESTIONS.

	##### YOU SHOULD BE ABLE TO USE JUPYTER FOR THIS, KERNEL = R (R-PROJECT)

	# If you have not done so already, install the "arm" package and load the library

	install.packages("arm")

	library(arm)

	# Load data given in pre-class work: "http://tinyurl.com/z5ygmch"

	# RUN THE REGRESSION
	sesame <- read.csv(url("https://docs.google.com/spreadsheets/d/e/2PACX-1vQC5enEj91bsrAmXER5z0eC_xlUbe_rhYiXEeTlb90-w1pxbqfmejfRwaCvRZzm201_nSlS-ZG0MN70/pub?gid=1873712132&single=true&output=csv"))

	# Note: The ".4" below, (“lm.4”), indicates results for students in grade 4
	lm.4 <- lm (post.test ~ treatment + pre.test + I(treatment*pre.test), data = sesame)
	lm.4.equivalent <- lm(post.test ~ treatment*pre.test, data = sesame)
	lm.4.equivalent2 <- lm(post.test ~ treatment + pre.test + treatment:pre.test, data = sesame)
	display (lm.4)
	display (lm.4.equivalent) # Just to demonstrate how this syntax works
	display (lm.4.equivalent2)# Just to demonstrate how this syntax works

	# APPLY THE "SIM" FUNCTION TO SIMULATE THE UNCERTAINTY
	lm.4.sim <- sim (lm.4, n.sims = 1000)

	(lm.4.sim)@coef # explore the simulated coefficients all equally likely under this model
	lm.4.sim@sigma # explore the simulated estimates of fundamental uncertainty
	# these are simulations of standard deviations of the error term of the regression
	# which (you may recall) is assumed to be distributed via a Normal distribution

	# QUESTION: Which coefficients are the key for estimating treatment effects?
	# the coefficients for the treatment indicator and interaction term?
	# Confirm that they are the SECOND and FOURTH columns, following the logic below...

	# According to this model...
	# treatment effect = Y(treatment = 1) - Y(treatment = 0)

	# treatment effect = (coef(treatment)treatment + coef(pre.test)pre.test +
	# coef(interaction)treatmentpre.test) when treatment == 1
	# MINUS
	# (coef(treatment)treatment + coef(pre.test)pre.test +
	# coef(interaction)treatmentpre.test) when treatment == 0

	# treatment effect = coef(treatment)1 + coef(pre.test)pre.test + coef(interaction)1pre.test)
	# MINUS
	# (coef(treatment)0 + coef(pre.test)pre.test + coef(interaction)0pre.test)

	# treatment effect = coef(treatment) + coef(interaction)*pre.test)


	# treatment effect = coef(interaction.term)*pre.test + coef(treatment)

	# NOTICE!!! This is what we worked on together as a class, earlier.
	# This equation can be rewritten as:
	# y = m*x + b IF y = treatment effect and x = pre.test

	# In other words, if you make a figure that plots pre-test on the x axis, and
	# treatment effect on the y-axis, coef(treatment) is "b", the intercept, and
	# coef(interaction.term) is "m", the slope.

	# Let's make a figure like that, reproducing the plot we saw earlier.

	# REPRODUCE THE first line in the PLOT we saw earlier
	# FIRST JUST THE AXES (no data plotted)
	plot (0, 0, xlim=c(80, 120), ylim=c(-5,10),
	xlab="pre-test", ylab="treatment effect",
	main="treatment effect in grade 4")

	abline (h = 0, lwd=.5, lty=2) # draws a horizontal line

	# "abline" draws a line with intercept "a" and slope "b".
	# ###TASK -- DETERMINE WHY WE TAKE coef()[1,2] and coef()[1,4] below!!!
	abline (a = coef(lm.4.sim)[1,2], b = coef(lm.4.sim)[1,4],
	lwd = .5, col = "gray")

	## Additional exercises...

	## EXERCISE 1 (Fill in the "???")

	# The line you just drew is associated with the first row in the table of
	# simulated coefficients. (This is a partial answer re the task directly above.)
	# You could write a "for loop" to draw one line for each
	# of the other rows..

	for (i in 1:1000) {
	abline (a = coef(lm.4.sim)[i, 2], b = coef(lm.4.sim)[i, 4],
	lwd = .2, col = "gray")
	}

	## Recover the average treatment effect previously obtained by regression
	## by simulating & averaging over the simulated coefficients. Draw a thicker
	## line showing the average treatment effect line.
	## Thicken the line with lwd = 3 added as an argument to abline.
	## You can also color the line: e.g., color = "purple4"

	abline (a = mean(coef(lm.4.sim)[,2]),
	b = mean(coef(lm.4.sim)[,4]),
	lwd = 3, col = "purple4")


	## EXERCISE 2
	## Estimate standard error of 'treatment' coefficient by calculating the
	## standard deviation of the simulated "treatment" coefficients.
	sd(coef(lm.4.sim)[,2])
	sd(coef(lm.4.sim)[,4])

	## EXERCISE 3
	## Use confint(lm.4.sim) to get confidence intervals, and then confirm the
	## confidence intervals by simulation (using “quantile” function)...
	confint(lm.4)
	quantile(coef(lm.4.sim)[,2], c(0.025, 0.975))
	quantile(coef(lm.4.sim)[,4], c(0.025, 0.975))

	## EXERCISE 4
	## Estimate 95% intervals for the expected value of the dependent variable using the simulated results.
	pre.test.mean <- mean(sesame$pre.test)
	treatment.mean <- mean(sesame$treatment)
	hyp.unit <- c(1, treatment.mean, pre.test.mean, pre.test.mean*treatment.mean)
	pred <- sum(hyp.unit * coef(lm.4.sim)[1,])

	preds <- c()
	for (i in 1:1000) {
	preds[i] <- sum(hyp.unit * coef(lm.4.sim)[i,])
	}
	preds
	quantile(preds, c(0.025, 0.975))

	## EXERCISE 5
	## Estimate 95% intervals for the dependent variable using the simulated results.
	preds <- c()
	for (i in 1:1000) {
	preds[i] <- sum(hyp.unit * coef(lm.4.sim)[i,]) + rnorm(1, 0, lm.4.sim@sigma[i])
	}
	preds
	quantile(preds, c(0.025, 0.975))

	# y = coefficients * cov. values + normal(0, std)

	## EXERCISE 6
	## Calculate RSS and R-squared
	display(lm.4)