Skip to content

Instantly share code, notes, and snippets.

@drsnyder
Created October 3, 2013 18:44
Show Gist options
  • Save drsnyder/6814904 to your computer and use it in GitHub Desktop.
Save drsnyder/6814904 to your computer and use it in GitHub Desktop.
Process CA Wage Data. Data pulled from here http://publicpay.ca.gov/Reports/RawExport.aspx.
# Load the 2012 CA public-pay export and derive the wage vector and the
# percentile cut-offs used by the rest of the script.
#
# library() is used instead of require(): require() only warns and returns
# FALSE when a package is missing, so the script would fail later with an
# unrelated-looking error. library() stops immediately.
library(ggplot2)
library(reldist)
library(plyr)

# Input preparation (raw exports from
# http://publicpay.ca.gov/Reports/RawExport.aspx):
#   mv 2012_StateDepartment.csv 2012_ca_all.csv
#   sed '1d' 2012_HigherEd-CAStateUniversity.csv >> 2012_ca_all.csv
# i.e. the CSU export is appended to the state department export with its
# first (header) line removed.

# Very wide output width for the data frames printed further down.
options(width = 1000)

allca <- read.csv("2012_ca_all.csv", header = TRUE)

# Keep only the columns the analysis uses.
trimmed <- allca[, c("Entity.Name", "Position", "Total.Wages")]

# Single-column data frame of wages; this is the data source for the plots.
allcawages <- data.frame(total = trimmed$Total.Wages)

# 99th and 99.99th percentile wage cut-offs as plain (unnamed) numbers.
ninety_nine <- unname(quantile(allcawages$total, probs = 0.99))[1]
ninety_nine_nine <- unname(quantile(allcawages$total, probs = 0.9999))[1]
# Empirical CDF of total wages, with the x axis in thousands of dollars.
#
# The x breaks are computed in the same unit as the plotted values
# (total / 1000). The upper bound carries some headroom (10% plus one step) so
# that a break falling in the axis' expansion margin is still generated.
cdf_break_step <- 50
cdf_break_max <- 1.1 * max(allcawages$total) / 1000.0 + cdf_break_step

wages_cdf_plot <- ggplot(allcawages, aes(x = total / 1000.0)) +
  stat_ecdf() +
  scale_x_continuous(breaks = seq(0, cdf_break_max, by = cdf_break_step)) +
  scale_y_continuous(breaks = seq(0.0, 1, 0.05)) +
  labs(x = "Total Wages (in 1000s)", y = "Probability",
       title = "CA State Worker Wages CDF")

# Draw the plot explicitly (top-level auto-printing does not happen under
# source()) and hand the plot object to ggsave() rather than relying on
# last_plot().
print(wages_cdf_plot)
ggsave("state-worker-wages-cdf.png", plot = wages_cdf_plot)
# Print headline numbers: the 99.99th percentile wage, the overall payroll,
# the payroll share earned at/above and below the 99th percentile cut-off,
# and the Gini coefficient of the wage distribution.
wage_values <- allcawages$total
in_top_percent <- wage_values >= ninety_nine

total <- sum(wage_values)
topone <- wage_values[in_top_percent]
lowernn <- wage_values[wage_values < ninety_nine]

top_share <- sum(topone) / total
lower_share <- sum(lowernn) / total

print(paste("99.99th ", ninety_nine_nine))
print(paste0("total wages ", total))
print(paste0("top one % ", top_share))
print(paste0("lower 99 % ", lower_share))
print(paste("gini ", gini(wage_values)))
# Histogram of total wages in thousands of dollars, 10k-wide bins.
#
# The x breaks are computed in the same unit as the plotted values
# (total / 1000). The upper bound carries some headroom (10% plus one step) so
# that a break falling in the axis' expansion margin is still generated.
hist_break_step <- 50
hist_break_max <- 1.1 * max(allcawages$total) / 1000 + hist_break_step

wages_hist_plot <- ggplot(allcawages) +
  geom_histogram(aes(x = total / 1000), binwidth = 10) +
  scale_x_continuous(breaks = seq(0, hist_break_max, by = hist_break_step)) +
  scale_y_continuous(breaks = seq(0, 75000, 10000)) +
  labs(x = "Total Wages (in 1000s)", y = "Count",
       title = "CA State Worker Wages Histogram")

# Draw the plot explicitly (top-level auto-printing does not happen under
# source()) and hand the plot object to ggsave() rather than relying on
# last_plot().
print(wages_hist_plot)
# NOTE(review): "wagest" in the file name looks like a typo; it is kept
# unchanged so anything that already references this file keeps working.
ggsave("state-worker-wagest-hist.png", plot = wages_hist_plot)
# The 25 highest-paid rows (entity, position, total wages).
print(head(trimmed[order(-trimmed$Total.Wages), ], 25))

# Total payroll per entity.
byentity <- ddply(trimmed, .(Entity.Name), summarize, Total = sum(Total.Wages))

# Order the factor levels by descending payroll so the bars are drawn from
# largest to smallest along the x axis.
byentity <- transform(byentity, Entity.Name = reorder(Entity.Name, -Total))

# reorder() only changes the level order, not the row order, so the rows are
# sorted explicitly before head(); otherwise head() returns the first 31
# entities in ddply's group order rather than the 31 largest payrolls.
top_entities <- head(byentity[order(-byentity$Total), ], 31)

entity_plot <- ggplot(top_entities) +
  # Pre-summarised values: a bar layer with stat = "identity". (A histogram
  # layer here makes newer ggplot2 warn about ignored binning parameters.)
  geom_bar(aes(x = Entity.Name, y = Total / 10^9), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(breaks = seq(0, 6, 0.5)) +
  labs(x = "Department", y = "Total Wages in Billions",
       title = "Total Wages by Entity") +
  # annotate() draws the label once; geom_text() would inherit the plot data
  # and draw one overlapping copy per row.
  annotate("text", x = 20, y = 2.5,
           label = "Dept. of Corrections is 31 times the Dept. of Education",
           size = 4)

# Draw the plot explicitly (top-level auto-printing does not happen under
# source()) and hand the plot object to ggsave() rather than relying on
# last_plot().
print(entity_plot)
ggsave("state-worker-wages-by-department.png", plot = entity_plot)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment