carbocation/nyc_teachers.R

## nyc_teachers.R
# Directory that holds the two tab-delimited input files.
path <- "/Users/jpirruccello/Downloads"

library(ggplot2)
library(mboost)

# Load the data.
# file.path() joins the directory and file name with "/" (same string the
# previous paste(..., sep = "/") produced); read.delim() is read.csv() with
# sep = "\t" as its default.
y09 <- read.delim(file.path(path, "TDI_20082009_FOIL_Press_JP.txt"))
y10 <- read.delim(file.path(path, "TDI_20092010_FOIL_Press_JP.txt"))

# Identify teachers by the following columns and merge the two files on them.
# merge() keeps only key combinations present in both inputs; non-key columns
# that exist in both files are suffixed ".x" (from y09) and ".y" (from y10).
# NOTE(review): presumably a name/subject/grade combination is unique per
# teacher; if it is not, merge() pairs every matching row -- confirm.
identifiers <- c(
  "subject", "grade", "teacher_name_first_1", "teacher_name_last_1"
)
df <- merge(y09, y10, by = identifiers)

# Keep only solo teacher records, ignoring co-teaching: the second-teacher
# first-name field must be empty in both files.
df <- subset(df, teacher_name_first_2.x == "" & teacher_name_first_2.y == "")

# 4th grade only
df <- subset(df, grade == "4th Grade")

# Per-row difference between the two score columns: scores[2] minus scores[1].
scores <- c("va_0809.x", "va_0910")
df$va_diff_09_10 <- df[[scores[2]]] - df[[scores[1]]]

# Model using GLM's Gaussian linear regressor (glm() uses family = gaussian
# when none is given): va_0910 regressed on va_0809.x.
model <- glm(va_0910 ~ va_0809.x, data = df)

# Predictions on the same rows the model was fitted to, next to the observed
# values, plus the residual and squared residual for each row.
x <- data.frame(pred = predict(model, newdata = df), real = df$va_0910)
x$diff <- x$real - x$pred
x$diffsq <- x$diff^2 # vectorised; no per-element lapply() needed

# Column totals; the `diffsq` entry is the sum of squared residuals.
# Any NA in a column makes that column's total NA.
colSums(x)

# Model using additive boosting: gamboost() with one bbs() base-learner per
# predictor, stopped after 500 boosting iterations.
m1 <- gamboost(
  va_0910 ~ bbs(va_0809.x) + bbs(n_0910) + bbs(pretest_0910),
  data = df,
  control = boost_control(mstop = 500)
)

# Predictions on the fitting data next to the observed values, with the
# residual and squared residual for each row.
x1 <- data.frame(pred = predict(m1, newdata = df), real = df$va_0910)
x1$diff <- x1$real - x1$pred
x1$diffsq <- x1$diff^2 # vectorised; no per-element lapply() needed

# Column totals; the `diffsq` entry is the sum of squared residuals.
colSums(x1)

# Same model with bbs(predicted_0910) added as a fourth base-learner.
m2 <- gamboost(
  va_0910 ~ bbs(va_0809.x) + bbs(n_0910) + bbs(pretest_0910) +
    bbs(predicted_0910),
  data = df,
  control = boost_control(mstop = 500)
)
x2 <- data.frame(pred = predict(m2, newdata = df), real = df$va_0910)
x2$diff <- x2$real - x2$pred
x2$diffsq <- x2$diff^2
colSums(x2)

# Three views of the joint distribution of va_0809.x (x axis) and va_0910
# (y axis).
#
# Columns are referenced by bare name inside aes() so they are looked up in
# the `data` argument; writing `df$col` there is discouraged by ggplot2 (it
# warns and bypasses the layer data). As a side effect the default axis
# labels are now the column names rather than "df$...".
# `..level..` is kept for compatibility with older ggplot2 releases; newer
# releases spell it after_stat(level).

# 1. 2-D density contour lines.
ggplot(df, aes(x = va_0809.x, y = va_0910)) +
  geom_density2d()

#From http://stackoverflow.com/questions/7073315/how-do-i-create-a-continuous-density-heatmap-of-2d-scatter-data-in-r

# 2. Density level mapped to transparency, with the raw points overlaid in
#    nearly transparent red.
ggplot(df, aes(x = va_0809.x, y = va_0910)) +
  stat_density2d(aes(alpha = ..level..), geom = "tile") +
  scale_alpha_continuous(limits = c(0, 0.2), breaks = seq(0, 0.2, by = 0.025)) +
  geom_point(colour = "red", alpha = 0.01) +
  theme_bw()

# 3. Filled density polygons coloured from blue (low level) to green (high).
ggplot(df, aes(x = va_0809.x, y = va_0910)) +
  stat_density2d(aes(fill = ..level..), geom = "polygon") +
  scale_fill_gradient(low = "blue", high = "green")
	# NOTE(review): from here to the end of the file the script repeats the
	# pipeline at the top of the file (same statements, tab-indented). It
	# re-reads the inputs and rebuilds every object; consider deleting this
	# second copy if the repetition is unintentional.

	# Directory that holds the two tab-delimited input files.
	path <- "/Users/jpirruccello/Downloads"

	library(ggplot2)
	library(mboost)

	# Load the data.
	# file.path() joins the directory and file name with "/" (same string the
	# previous paste(..., sep = "/") produced); read.delim() is read.csv() with
	# sep = "\t" as its default.
	y09 <- read.delim(file.path(path, "TDI_20082009_FOIL_Press_JP.txt"))
	y10 <- read.delim(file.path(path, "TDI_20092010_FOIL_Press_JP.txt"))

	# Identify teachers by the following columns and merge the two files on
	# them. merge() keeps only key combinations present in both inputs; non-key
	# columns that exist in both files are suffixed ".x" (y09) and ".y" (y10).
	# NOTE(review): presumably a name/subject/grade combination is unique per
	# teacher; if it is not, merge() pairs every matching row -- confirm.
	identifiers <- c(
	  "subject", "grade", "teacher_name_first_1", "teacher_name_last_1"
	)
	df <- merge(y09, y10, by = identifiers)

	# Keep only solo teacher records, ignoring co-teaching: the second-teacher
	# first-name field must be empty in both files.
	df <- subset(df, teacher_name_first_2.x == "" & teacher_name_first_2.y == "")

	# 4th grade only
	df <- subset(df, grade == "4th Grade")

	# Per-row difference between the two score columns: scores[2] minus
	# scores[1].
	scores <- c("va_0809.x", "va_0910")
	df$va_diff_09_10 <- df[[scores[2]]] - df[[scores[1]]]

	# Model using GLM's Gaussian linear regressor (glm() uses family = gaussian
	# when none is given): va_0910 regressed on va_0809.x.
	model <- glm(va_0910 ~ va_0809.x, data = df)

	# Predictions on the same rows the model was fitted to, next to the
	# observed values, plus the residual and squared residual for each row.
	x <- data.frame(pred = predict(model, newdata = df), real = df$va_0910)
	x$diff <- x$real - x$pred
	x$diffsq <- x$diff^2 # vectorised; no per-element lapply() needed

	# Column totals; the `diffsq` entry is the sum of squared residuals.
	# Any NA in a column makes that column's total NA.
	colSums(x)

	# Model using additive boosting: gamboost() with one bbs() base-learner per
	# predictor, stopped after 500 boosting iterations.
	m1 <- gamboost(
	  va_0910 ~ bbs(va_0809.x) + bbs(n_0910) + bbs(pretest_0910),
	  data = df,
	  control = boost_control(mstop = 500)
	)

	# Predictions on the fitting data next to the observed values, with the
	# residual and squared residual for each row.
	x1 <- data.frame(pred = predict(m1, newdata = df), real = df$va_0910)
	x1$diff <- x1$real - x1$pred
	x1$diffsq <- x1$diff^2 # vectorised; no per-element lapply() needed

	# Column totals; the `diffsq` entry is the sum of squared residuals.
	colSums(x1)

	# Same model with bbs(predicted_0910) added as a fourth base-learner.
	m2 <- gamboost(
	  va_0910 ~ bbs(va_0809.x) + bbs(n_0910) + bbs(pretest_0910) +
	    bbs(predicted_0910),
	  data = df,
	  control = boost_control(mstop = 500)
	)
	x2 <- data.frame(pred = predict(m2, newdata = df), real = df$va_0910)
	x2$diff <- x2$real - x2$pred
	x2$diffsq <- x2$diff^2
	colSums(x2)

	# Three views of the joint distribution of va_0809.x (x axis) and va_0910
	# (y axis).
	#
	# Columns are referenced by bare name inside aes() so they are looked up in
	# the `data` argument; writing `df$col` there is discouraged by ggplot2 (it
	# warns and bypasses the layer data). As a side effect the default axis
	# labels are now the column names rather than "df$...".
	# `..level..` is kept for compatibility with older ggplot2 releases; newer
	# releases spell it after_stat(level).

	# 1. 2-D density contour lines.
	ggplot(df, aes(x = va_0809.x, y = va_0910)) +
	  geom_density2d()

	#From http://stackoverflow.com/questions/7073315/how-do-i-create-a-continuous-density-heatmap-of-2d-scatter-data-in-r

	# 2. Density level mapped to transparency, with the raw points overlaid in
	#    nearly transparent red.
	ggplot(df, aes(x = va_0809.x, y = va_0910)) +
	  stat_density2d(aes(alpha = ..level..), geom = "tile") +
	  scale_alpha_continuous(limits = c(0, 0.2), breaks = seq(0, 0.2, by = 0.025)) +
	  geom_point(colour = "red", alpha = 0.01) +
	  theme_bw()

	# 3. Filled density polygons coloured from blue (low level) to green (high).
	ggplot(df, aes(x = va_0809.x, y = va_0910)) +
	  stat_density2d(aes(fill = ..level..), geom = "polygon") +
	  scale_fill_gradient(low = "blue", high = "green")