# drsnyder/ca-wages.R

## ca-wages.R
require(ggplot2)
require(reldist)
require(plyr)
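# Data source: raw CSV exports from http://publicpay.ca.gov/Reports/RawExport.aspx
# Build the combined input file from the shell before running this script:
#   mv 2012_StateDepartment.csv 2012_ca_all.csv
#   (then append the CSU export with its first (header) line removed)
#   sed '1d' 2012_HigherEd-CAStateUniversity.csv >> 2012_ca_all.csv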

# Widen console output so the wide data-frame rows printed later in the script
# are not wrapped.
options(width = 1000)

# Load the combined wage export (2012_ca_all.csv, expected in the working
# directory) and keep only the columns used by the rest of the script.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
allca <- read.csv("2012_ca_all.csv", header = TRUE)
trimmed <- allca[, c("Entity.Name", "Position", "Total.Wages")]

# Single-column data frame of wages; this is the data set behind the CDF and
# histogram plots.
allcawages <- data.frame(total = trimmed$Total.Wages)

# 99th and 99.99th percentiles of total wages, as plain unnamed numbers.
# Note: despite its name, `ninety_nine_nine` holds the 99.99th percentile.
ninety_nine <- quantile(allcawages$total, probs = 0.99, names = FALSE)
ninety_nine_nine <- quantile(allcawages$total, probs = 0.9999, names = FALSE)


# Empirical CDF of total wages, with wages shown in thousands of dollars.
# The x-axis breaks are generated in the same unit as the plotted values
# (thousands). The upper bound used to be the raw dollar maximum, which
# produced a break sequence running far beyond the plotted range.
ggplot(allcawages, aes(x = total / 1000.0)) +
  stat_ecdf() +
  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
  scale_y_continuous(breaks = seq(0.0, 1, 0.05)) +
  labs(
    x = "Total Wages (in 1000s)",
    y = "Probability",
    title = "CA State Worker Wages CDF"
  )
# No plot is passed, so ggsave() writes the most recent ggplot to disk.
ggsave("state-worker-wages-cdf.png")

# Wage concentration: how the total wage bill splits between earners at or
# above the 99th percentile and everyone below it.
total <- sum(allcawages$total)
topone <- allcawages$total[allcawages$total >= ninety_nine]
lowernn <- allcawages$total[allcawages$total < ninety_nine]

print(paste("99.99th ", ninety_nine_nine))
print(paste0("total wages ", total))
print(paste0("top one % ", sum(topone) / total))
print(paste0("lower 99 % ", sum(lowernn) / total))

# Gini coefficient of the wage distribution.
print(paste("gini ", gini(allcawages$total)))

# Histogram of total wages in thousands of dollars, using bins 10 (thousand)
# wide. The x-axis breaks are generated in thousands to match the plotted
# values; the upper bound used to be the raw dollar maximum, a unit mismatch.
ggplot(allcawages) +
  geom_histogram(aes(x = total / 1000), binwidth = 10) +
  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
  scale_y_continuous(breaks = seq(0, 75000, 10000)) +
  labs(
    x = "Total Wages (in 1000s)",
    y = "Count",
    title = "CA State Worker Wages Histogram"
  )
# NOTE(review): "wagest" in the file name looks like a typo for "wages"; it is
# left unchanged in case anything else expects this exact name.
ggsave("state-worker-wagest-hist.png")

# Print the 25 records with the highest total wages (entity, position, wages).
print(head(trimmed[order(-trimmed[["Total.Wages"]]), ], n = 25))


# Total wages per entity, summed over all of that entity's records.
byentity <- ddply(trimmed, .(Entity.Name), summarize, Total = sum(Total.Wages))
# Re-level Entity.Name by descending total so bars are drawn largest first.
# This changes only the factor level order, not the row order of `byentity`.
byentity <- transform(byentity, Entity.Name = reorder(Entity.Name, -Total))

# NOTE(review): head() takes the first 31 rows in ddply's output order, which
# is not necessarily the 31 largest totals -- confirm this is the intended
# subset.
ggplot(head(byentity, 31)) +
  # Bar heights are pre-computed, so use a bar geom with the identity stat.
  # geom_histogram() is meant for binning and, in recent ggplot2 versions,
  # warns about its unused bin parameters when given stat = "identity".
  geom_bar(aes(x = Entity.Name, y = Total / 10^9), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(breaks = seq(0, 6, 0.5)) +
  labs(
    x = "Department",
    y = "Total Wages in Billions",
    title = "Total Wages by Entity"
  ) +
  # annotate() draws the label exactly once. geom_text() with the inherited
  # plot data drew one overlapping copy of the label per data row.
  annotate(
    "text",
    x = 20, y = 2.5,
    label = "Dept. of Corrections is 31 times the Dept. of Education",
    size = 4
  )
# No plot is passed, so ggsave() writes the most recent ggplot to disk.
ggsave("state-worker-wages-by-department.png")
	require(ggplot2)
	require(reldist)
	require(plyr)
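	# Data source: raw CSV exports from http://publicpay.ca.gov/Reports/RawExport.aspx
	# Build the combined input file from the shell before running this script:
	#   mv 2012_StateDepartment.csv 2012_ca_all.csv
	#   (then append the CSU export with its first (header) line removed)
	#   sed '1d' 2012_HigherEd-CAStateUniversity.csv >> 2012_ca_all.csv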

	# Widen console output so the wide data-frame rows printed later in the
	# script are not wrapped.
	options(width = 1000)

	# Load the combined wage export (2012_ca_all.csv, expected in the working
	# directory) and keep only the columns used by the rest of the script.
	# `TRUE` is spelled out because `T` is an ordinary variable that can be
	# reassigned.
	allca <- read.csv("2012_ca_all.csv", header = TRUE)
	trimmed <- allca[, c("Entity.Name", "Position", "Total.Wages")]

	# Single-column data frame of wages; this is the data set behind the CDF
	# and histogram plots.
	allcawages <- data.frame(total = trimmed$Total.Wages)

	# 99th and 99.99th percentiles of total wages, as plain unnamed numbers.
	# Note: despite its name, `ninety_nine_nine` holds the 99.99th percentile.
	ninety_nine <- quantile(allcawages$total, probs = 0.99, names = FALSE)
	ninety_nine_nine <- quantile(allcawages$total, probs = 0.9999, names = FALSE)


	# Empirical CDF of total wages, with wages shown in thousands of dollars.
	# The x-axis breaks are generated in the same unit as the plotted values
	# (thousands). The upper bound used to be the raw dollar maximum, which
	# produced a break sequence running far beyond the plotted range.
	ggplot(allcawages, aes(x = total / 1000.0)) +
	  stat_ecdf() +
	  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
	  scale_y_continuous(breaks = seq(0.0, 1, 0.05)) +
	  labs(
	    x = "Total Wages (in 1000s)",
	    y = "Probability",
	    title = "CA State Worker Wages CDF"
	  )
	# No plot is passed, so ggsave() writes the most recent ggplot to disk.
	ggsave("state-worker-wages-cdf.png")

	# Wage concentration: how the total wage bill splits between earners at or
	# above the 99th percentile and everyone below it.
	total <- sum(allcawages$total)
	topone <- allcawages$total[allcawages$total >= ninety_nine]
	lowernn <- allcawages$total[allcawages$total < ninety_nine]

	print(paste("99.99th ", ninety_nine_nine))
	print(paste0("total wages ", total))
	print(paste0("top one % ", sum(topone) / total))
	print(paste0("lower 99 % ", sum(lowernn) / total))

	# Gini coefficient of the wage distribution.
	print(paste("gini ", gini(allcawages$total)))

	# Histogram of total wages in thousands of dollars, using bins 10 (thousand)
	# wide. The x-axis breaks are generated in thousands to match the plotted
	# values; the upper bound used to be the raw dollar maximum, a unit mismatch.
	ggplot(allcawages) +
	  geom_histogram(aes(x = total / 1000), binwidth = 10) +
	  scale_x_continuous(breaks = seq(0, max(allcawages$total) / 1000, 50)) +
	  scale_y_continuous(breaks = seq(0, 75000, 10000)) +
	  labs(
	    x = "Total Wages (in 1000s)",
	    y = "Count",
	    title = "CA State Worker Wages Histogram"
	  )
	# NOTE(review): "wagest" in the file name looks like a typo for "wages"; it
	# is left unchanged in case anything else expects this exact name.
	ggsave("state-worker-wagest-hist.png")

	# Print the 25 records with the highest total wages (entity, position, wages).
	print(head(trimmed[order(-trimmed[["Total.Wages"]]), ], n = 25))


	# Total wages per entity, summed over all of that entity's records.
	byentity <- ddply(trimmed, .(Entity.Name), summarize, Total = sum(Total.Wages))
	# Re-level Entity.Name by descending total so bars are drawn largest first.
	# This changes only the factor level order, not the row order of `byentity`.
	byentity <- transform(byentity, Entity.Name = reorder(Entity.Name, -Total))

	# NOTE(review): head() takes the first 31 rows in ddply's output order, which
	# is not necessarily the 31 largest totals -- confirm this is the intended
	# subset.
	ggplot(head(byentity, 31)) +
	  # Bar heights are pre-computed, so use a bar geom with the identity stat.
	  # geom_histogram() is meant for binning and, in recent ggplot2 versions,
	  # warns about its unused bin parameters when given stat = "identity".
	  geom_bar(aes(x = Entity.Name, y = Total / 10^9), stat = "identity") +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
	  scale_y_continuous(breaks = seq(0, 6, 0.5)) +
	  labs(
	    x = "Department",
	    y = "Total Wages in Billions",
	    title = "Total Wages by Entity"
	  ) +
	  # annotate() draws the label exactly once. geom_text() with the inherited
	  # plot data drew one overlapping copy of the label per data row.
	  annotate(
	    "text",
	    x = 20, y = 2.5,
	    label = "Dept. of Corrections is 31 times the Dept. of Education",
	    size = 4
	  )
	# No plot is passed, so ggsave() writes the most recent ggplot to disk.
	ggsave("state-worker-wages-by-department.png")