Skip to content

Instantly share code, notes, and snippets.

@RandomCriticalAnalysis
Last active May 31, 2016 21:41
Show Gist options
  • Save RandomCriticalAnalysis/69ca6628df5ccae25fe25fce8d71450c to your computer and use it in GitHub Desktop.
Save RandomCriticalAnalysis/69ca6628df5ccae25fe25fce8d71450c to your computer and use it in GitHub Desktop.
# For Anatoly: relevant code for the education analysis.
# Test scores and most covariates are taken from:
# https://cepa.stanford.edu/seda/download?nid=1727&destination=node/1717
# Parents' average years of education is computed from Census ACS data (via the Dept. of Education's EDGE service):
# http://nces.ed.gov/programs/edge/demographicACS.aspx
# Load the district-level inputs and join them into one table (glarge2).
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
pm <- read.csv("cepa_pooled_means.csv", stringsAsFactors = FALSE)
cve <- read.csv("cepa_covariates_from_excel.csv", stringsAsFactors = FALSE)

# merge() keeps only districts present in both tables (its default).
glarge <- merge(cve, pm, by.x = "leaid", by.y = "nces_district_id")

ed <- read.csv("nces_parent_ed_levels.csv", stringsAsFactors = FALSE)

# The join key is characters 8 onward of GeoId, read as an integer.
# NOTE(review): presumably GeoId carries a 7-character prefix ahead of the
# district id -- confirm against the EDGE export.
ed$nces_district_id <- as.integer(substr(ed$GeoId, 8, 20))

# Average years of education: each attainment share is divided by 100 and
# multiplied by the years of schooling assigned to that level
# (8, 10, 12, 13, 14, 16, 20), then the terms are summed.
# NOTE(review): assumes the ed_* columns are percentages that sum to 100 per
# district -- confirm. A missing share makes the whole row NA.
ed$avg_years_education <- (ed$ed_less_than_9th_grade / 100 * 8) +
  (ed$ed_9th_12th_grade / 100 * 10) +
  (ed$ed_hs_grad / 100 * 12) +
  (ed$ed_some_college / 100 * 13) +
  (ed$ed_assoc_degree / 100 * 14) +
  (ed$ed_bach_degree / 100 * 16) +
  (ed$ed_grad_prof_degree / 100 * 20)

# Attach the education estimate; again only matching districts are kept.
glarge2 <- merge(glarge, ed, by.x = "leaid", by.y = "nces_district_id")
# Short aliases for the long, auto-generated column names, so that later
# formulas and plots are easier to type. Entries are alias = source column and
# are applied in order, so the new columns are appended in this order.
short_names <- c(
  ba_all = "X..of.adults.with.ba...all.",
  ba_wht = "X..of.adults.with.ba...wht.",
  ba_blk = "X..of.adults.with.ba...blk.",
  ba_hsp = "X..of.adults.with.ba...hsp.",
  pct_wht = "percent.whites.in.the.district",
  pct_blk = "percent.blacks.in.the.district",
  pct_hsp = "percent.hispanics.in.the.district",
  pct_asn = "percent.asians.in.the.district",
  percent_na = "percent.native.americans.in.the.district",
  spop = "Total.Enrollment..Grades.3.8"
)
for (short_nm in names(short_names)) {
  # exact = FALSE keeps the partial name matching that `$` performs.
  glarge2[[short_nm]] <- glarge2[[short_names[[short_nm]], exact = FALSE]]
}

# Combined share: Hispanic + Black + Native American.
glarge2[["pct_urm"]] <- glarge2[["pct_hsp"]] + glarge2[["pct_blk"]] +
  glarge2[["percent_na"]]
# Weighted OLS of pooled test scores on the district composition shares plus
# the estimated average years of adult education; linear terms only.
# Observations are weighted by spop. na.exclude drops incomplete rows from the
# fit but pads the predictions with NA, so they line up with glarge2's rows.
sire_ed_fit <- lm(
  pooled_score ~ pct_wht + pct_blk + pct_hsp + pct_asn + percent_na +
    avg_years_education,
  data = glarge2,
  weights = spop,
  na.action = na.exclude
)
glarge2$score_SIRE_ed <- predict(sire_ed_fit)

# The three Connecticut districts highlighted in the plots. This is taken after
# the prediction column is added so that ctd carries score_SIRE_ed too.
highlight_districts <- c(
  "DARIEN SCHOOL DISTRICT",
  "BRIDGEPORT SCHOOL DISTRICT",
  "GREENWICH SCHOOL DISTRICT"
)
ctd <- subset(
  glarge2,
  State.Abbreviation == "CT" & LEA.Name %in% highlight_districts
)
# Compute correlation coefficients between predicted and actual test scores.

# Weighted by spop.
# NOTE(review): wtd.cor is not defined in this file; presumably it comes from
# the 'weights' package -- confirm it is loaded before this point.
wtd.cor(glarge2$score_SIRE_ed, glarge2$pooled_score, glarge2$spop)

# Unweighted Pearson correlation. score_SIRE_ed is NA for every row the model
# dropped (the fit uses na.action = na.exclude), and cor() returns NA if any
# input is NA, so restrict the calculation to complete pairs.
cor(glarge2$score_SIRE_ed, glarge2$pooled_score, use = "complete.obs")
# Actual vs. model-predicted test scores, one point per district.
# All districts are drawn in translucent blue with a geom_smooth() trend; the
# districts in `ctd` are overlaid in green, sized by spop and labelled by name.
# NOTE(review): the two r values in the annotations are hard-coded text, not
# computed here -- update them by hand if the data or model changes.
# NOTE(review): geom_label_repel is presumably from 'ggrepel' -- confirm.
sire_ed_layers <- list(
  geom_point(color = "blue", alpha = 0.2),
  geom_smooth(),
  geom_point(data = ctd, color = "green", aes(size = spop)),
  geom_label_repel(data = ctd, aes(label = LEA.Name)),
  xlab("predicted district test scores\nusing race/ethnicity and estimated average years of education of adults, 25+ years old\n with OLS, linear terms, no interactions, etc"),
  ylab("actual test scores"),
  annotate("text", x = -0.65, y = 1, size = 8, label = "(weighted) r = .89"),
  annotate("text", x = -0.7, y = 0.8, size = 8, label = "(unweighted) r = .83")
)
ggplot(glarge2, aes(x = score_SIRE_ed, y = pooled_score)) + sire_ed_layers
library(scales) # provides dollar() for the x-axis labels

# Connecticut districts only.
ct_only <- subset(glarge2, State.Abbreviation == "CT")

# Test scores against total per-pupil expenditure within Connecticut.
# Points are sized by spop; the districts in `ctd` are overlaid in green and
# labelled by name.
ct_spending_plot <- ggplot(
  ct_only,
  aes(x = Total.PP.Expenditures..Tot.Exp.Enrl, y = pooled_score)
) +
  geom_point(aes(size = spop), color = "blue", alpha = 0.4) +
  geom_smooth() +
  geom_point(aes(size = spop), data = ctd, color = "green") +
  geom_label_repel(aes(label = LEA.Name), data = ctd) +
  scale_x_continuous(labels = dollar) +
  xlab("total expenditures per pupil") +
  ylab("actual test scores") +
  ggtitle("CT school districts test scores by total expenditures per pupil")
ct_spending_plot
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment