Created
October 3, 2013 18:44
-
-
Save drsnyder/6814904 to your computer and use it in GitHub Desktop.
Process CA wage data. The data was pulled from http://publicpay.ca.gov/Reports/RawExport.aspx.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analysis dependencies. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail
# later with a less obvious message.
library(ggplot2)  # plotting
library(reldist)  # gini()
library(plyr)     # ddply()

# Input data comes from http://publicpay.ca.gov/Reports/RawExport.aspx
# The two exports are combined into a single CSV before running this script
# (sed '1d' drops the first line of the second file before appending it):
#   mv 2012_StateDepartment.csv 2012_ca_all.csv
#   sed '1d' 2012_HigherEd-CAStateUniversity.csv >> 2012_ca_all.csv

# Use a very wide console so printed output is not wrapped.
options(width = 1000)
# Load the combined wage export; the first row holds the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
allca <- read.csv("2012_ca_all.csv", header = TRUE)

# Keep only the columns used by the rest of the script.
trimmed <- allca[, c("Entity.Name", "Position", "Total.Wages")]

# Single-column data frame of total wages, used for the plots and summaries.
allcawages <- data.frame(total = trimmed$Total.Wages)

# 99th and 99.99th percentile wage cut-offs as plain (unnamed) numbers.
ninety_nine <- unname(quantile(allcawages$total, probs = 0.99))
ninety_nine_nine <- unname(quantile(allcawages$total, probs = 0.9999))
# Empirical CDF of total wages, with the x axis expressed in thousands.
ggplot(allcawages, aes(x = total / 1000)) +
  stat_ecdf() +
  # The axis is in thousands, so the upper bound of the break sequence is
  # scaled the same way. Using the raw maximum generated a break every 50
  # units all the way up to the unscaled wage, far beyond the plotted range.
  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
  scale_y_continuous(breaks = seq(0, 1, 0.05)) +
  labs(
    x = "Total Wages (in 1000s)",
    y = "Probability",
    title = "CA State Worker Wages CDF"
  )
# With no plot argument, ggsave() writes the last plot.
ggsave("state-worker-wages-cdf.png")
# Summary figures: overall wage bill, the share earned at or above the 99th
# percentile versus below it, and the Gini coefficient of total wages.
total <- sum(allcawages$total)

# Split the wage vector at the 99th percentile cut-off.
is_top_one <- allcawages$total >= ninety_nine
lowernn <- allcawages$total[!is_top_one]
topone <- allcawages$total[is_top_one]

print(paste("99.99th ", ninety_nine_nine))
print(paste0("total wages ", total))
print(paste0("top one % ", sum(topone) / total))
print(paste0("lower 99 % ", sum(lowernn) / total))
print(paste("gini ", gini(allcawages$total)))
# Histogram of total wages in thousands, using bins 10 (thousand) wide.
ggplot(allcawages) +
  geom_histogram(aes(x = total / 1000), binwidth = 10) +
  # The axis is in thousands, so the upper bound of the break sequence is
  # scaled to match instead of using the raw wage maximum.
  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
  scale_y_continuous(breaks = seq(0, 75000, 10000)) +
  labs(
    x = "Total Wages (in 1000s)",
    y = "Count",
    title = "CA State Worker Wages Histogram"
  )
# NOTE(review): "wagest" in the file name looks like a typo; it is kept
# unchanged so the output path stays the same -- confirm before renaming.
ggsave("state-worker-wagest-hist.png")
# Print the 25 rows with the highest Total.Wages, largest first.
by_wage_desc <- order(-trimmed$Total.Wages)
print(head(trimmed[by_wage_desc, ], 25))
# Total wages per entity (department).
byentity <- ddply(trimmed, .(Entity.Name), summarize, Total = sum(Total.Wages))

# Order the Entity.Name levels by descending total so the bars are drawn from
# largest to smallest. This changes the level order only, not the row order.
byentity <- transform(byentity, Entity.Name = reorder(Entity.Name, -Total))

# NOTE(review): head() takes the first 31 rows in the order ddply() returned
# them, not the 31 largest totals -- confirm this is the intended selection.
ggplot(head(byentity, 31)) +
  # geom_bar(stat = "identity") plots the precomputed totals directly.
  # geom_histogram() with stat = "identity" also passes binning parameters
  # that the identity stat does not use.
  geom_bar(aes(x = Entity.Name, y = Total / 10^9), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(breaks = seq(0, 6, 0.5)) +
  labs(
    x = "Department",
    y = "Total Wages in Billions",
    title = "Total Wages by Entity"
  ) +
  # annotate() draws the label once. geom_text() inherits the plot data and
  # would draw the same label once per row, stacked on top of itself.
  annotate(
    "text",
    x = 20,
    y = 2.5,
    label = "Dept. of Corrections is 31 times the Dept. of Education",
    size = 4
  )
ggsave("state-worker-wages-by-department.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment