albert-decatur/a eVA procurements, cost by agency by date

## a eVA procurements, cost by agency by date
# for every agency, get name,isodates,cost
# cost is just units*unit price
# ignore cost if it's <1 USD
in=eVA_healthyRecords_2015-07-12.tsv
while read agency; do cat $in | tawk "{if(\$2 ~ /\"${agency}\"/ && \$5*\$6 > 1)print \$2,\$3,\$6*\$5}" | sed 's:"::g' | sed 's:\t\([0-9]\{2\}\)\/\([0-9]\{2\}\)\/\([0-9]\{4\}\)\t:\t\3-\1-\2\t:g'; done < /tmp/agencies | sed '1 i\agency\tdate\tamount'  >/tmp/eVA_costByDateByAgency.txt
# next take it into a db and group by date
~/git/aiddata-utils/etl/txt2pgsql.pl -i /tmp/eVA_rva.tsv -d "\t" -t "TEXT" -p del | sh
# group costs by agency,date
echo "copy ( select agency,date,sum(amount::numeric) as cost from \"eVA_rva\" group by agency,date ) to stdout with csv header;" | psql del > /tmp/eVA_rva.csv

## b eVA cumulative cost by date by agency
# cumulative cost of procurement by agency over time
# where fields are 1) agency name, 2) date order asc, 3) cost
library(ggplot2)
# to transform data
library(plyr)
# needed for commas in y axis breaks
library(scales)
# to label log scale axis with sensible ticks
df <- read.csv("extras/vcu_v_publicSchools.csv")
df.t <- ddply(df,.(agency),transform,ccost=cumsum(cost))
# make breaks for plot - these will follow log scale but in plain notation
breaks = 10**(1:10)
# plot cumulative cost over time by agency
# use log axis for y
# but breaks will not be in sci notation
# NB: casting date fields as.Date works wonders for axis labels, plot background, etc!
ggplot(df.t, aes(x=as.Date(date), y=ccost, colour=agency, group=agency)) + geom_line() + coord_trans(y = "log10") + labs(x="date",y="cumulative spending, USD, (log scale)",title="title") + scale_y_log10(breaks = breaks, labels = comma(breaks))

## c find years with no data for agency
in=eVA_yearsWithDataByAgency.tsv; cat eVA_totalCostByAgency.tsv |head -n 26 | c 1 | sed '1d' | tac | parallel 'yrs_avail=$(mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in' | cut -f 2); yrs_not_avail=$(hash_diff.pl -a <(echo "$yrs_avail") -b <(seq 2001 2015) | grep lista | grep -oE "^[^:]*" ); echo "$yrs_not_avail" | sed "s:^:$(echo {})\t:g"' | sed '1 i\agency\tyear_unavilable' > /tmp/foo.tsv

## d plot missing eVA data by agency year
# read in data with these conditions:
# vars are agency names, year with missing data, total agency cost (just for scaling point geoms)
# sorted by agency total cost asc
# sorted asc b/c coord_flip() flips that value
df <- read.csv("/tmp/foo.csv",sep="\t")
library(ggplot2)
# make sure to put this in the order the data appears,which is by total cost FY2001-2015
df$agency <- factor(df$agency, levels=unique(df$agency))
# plot with no legend, right justified title, use black outlines for minor panel grid, use tiles as geom and color by year
ggplot(df,aes(x=df$agency,y=df$year)) + coord_flip() + labs(x="eVA agency",y="year",title="Years with No Data by Agency\nState of Virginia Procurement, eVA FY2001-2015") + theme(plot.title = element_text(hjust=1)) + theme(legend.position="none") + geom_tile(fill=df$year) + theme(panel.grid.minor = element_line(colour = "black"))

## e heatmap of number of procurements by month
# read.csv a table with agency name, count of procurements, month (needs to be YYYY-MM-DD for this example but you can just set them all to the first)
png("/tmp/foo.png",unit="in",res=600,height=6,width=12);ggplot(df,aes(x=df$agency,y=df$month)) + coord_flip() + labs(x="eVA agency",y="year",title="Number of Procurements By Month,\neVA FY2001-2015\n(Top 25 Agencies by Spending)") + theme(legend.position="none") + scale_y_date(breaks = date_breaks("years"),labels = date_format("%Y")) + theme(axis.text.x = element_text(angle=290,vjust=.4)) + theme(panel.background = element_blank()) + geom_tile(aes(fill=df$count_po));dev.off()

## f get agency, month, monthly spend
# given a table of agency, date, cost
# produce a table of agency, month,procurement count, USD spent on procurement
# NB: none of this is inflation adjusted

in=eVA_costByDateByAgency.tsv; cat eVA_totalCostByAgency.tsv |head -n 26 | c 1 | sed '1d' | tac | parallel 'agency_records=$( mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in'); byMonth=$(echo "$agency_records" | cut -f2 | sed "s:...$::g" ); countByMonth=$( echo "$byMonth" | sort | uniq -c | sed "s:^\s\+::g;s:\s:\t:g" ); echo "$byMonth" | sort | uniq | while read month; do count=$( echo "$countByMonth" | grep -E "\b${month}\b" | cut -f1 ); monthly_cost=$( echo "$agency_records" | mawk -F"\t" "{if(\$2 ~ /^${month}/)print \$3}" | awk "{ sum += \$1 } END { printf \"%.4f\n\", sum }" ); echo -e {}"\t$(echo $month | sed "s:$:-01:g")\t$count\t$monthly_cost"; done' | sed '1 i\agency\tmonth\tpo_count\tpo_spend'

## g from monthly count and spend get annual count and spend
in=eVA_costByMonthByAgency.tsv; cat $in | sed '1d' | cut -f1 | sort | uniq | parallel 'records=$(mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in' ); echo "$records"|mawk -F"\t" "{gsub(/-[0-9]+-[0-9]+$/,\"\",\$2);print \$2}" | sort | uniq | while read year; do agency_yr=$( echo "$records" | mawk -F"\t" "{if(\$2 ~ /^${year}-/)print \$0}" ); count_po_yr=$( echo "$agency_yr" | cut -f 3 | awk "{ sum += \$1 } END { printf \"%.0f\n\", sum }" ); spend_yr=$( echo "$agency_yr" | cut -f 4 | awk "{ sum += \$1 } END { printf \"%.0f\n\", sum }" ); echo -e {}"\t$year\t$count_po_yr\t$spend_yr"; done'|sed '1 i\agency\tyear\tpo_count_yr\tpo_spend_yr'

## h heatmap of spending, ddply for transform to rescale by agency
# load spending by month with fields:
# agency name, month, spend for month, year, count of po
df <- read.csv("extras/eVA_costByMonthByAgency.tsv",sep="\t")
# load libraries
library(plyr) # for transform
library(ggplot2)
# date field as type date
df$date <- as.Date(df$date)
# get po_spend rescaled by agency/year
df.t <- ddply(df,.(agency,year),transform,rescaled_po_spend=rescale(po_spend))

png("/tmp/foo.png",unit="in",res=300,height=6,width=12);ggplot(df.t,aes(x=df.t$agency,y=df.t$month)) + coord_flip() + labs(x="agency",y="year") + theme(legend.position="none") + scale_y_date(breaks = date_breaks("years"),labels = date_format("%Y")) + theme(axis.text.x = element_text(angle=290,vjust=.4)) + scale_fill_gradient(low="white",high="steelblue") + theme(panel.grid = element_blank()) + geom_tile(aes(fill=df.t$rescaled_po_spend),color="grey",linestyle="dotted") + ggtitle(expression(atop("High Procurement Spending Months by Agency/Year\nState of Virginia, eVA FY2001-2015", atop(italic("top 25 agencies by total procurement spending. data: bit.ly/eVA-byMonth"), "")))) + theme(plot.title = element_text(hjust=0,vjust=0));dev.off()

## i label by month, change title

      
    Raw
  

              i label by month, change title
	# for every agency, get name,isodates,cost
	# cost is just units*unit price
	# ignore cost if it's <1 USD
	in=eVA_healthyRecords_2015-07-12.tsv
	while read agency; do cat $in \| tawk "{if(\$2 ~ /\"${agency}\"/ && \$5\$6 > 1)print \$2,\$3,\$6\$5}" \| sed 's:"::g' \| sed 's:\t\([0-9]\{2\}\)\/\([0-9]\{2\}\)\/\([0-9]\{4\}\)\t:\t\3-\1-\2\t:g'; done < /tmp/agencies \| sed '1 i\agency\tdate\tamount' >/tmp/eVA_costByDateByAgency.txt
	# next take it into a db and group by date
	~/git/aiddata-utils/etl/txt2pgsql.pl -i /tmp/eVA_rva.tsv -d "\t" -t "TEXT" -p del \| sh
	# group costs by agency,date
	echo "copy ( select agency,date,sum(amount::numeric) as cost from \"eVA_rva\" group by agency,date ) to stdout with csv header;" \| psql del > /tmp/eVA_rva.csv
	# cumulative cost of procurement by agency over time
	# where fields are 1) agency name, 2) date order asc, 3) cost
	library(ggplot2)
	# to transform data
	library(plyr)
	# needed for commas in y axis breaks
	library(scales)
	# to label log scale axis with sensible ticks
	df <- read.csv("extras/vcu_v_publicSchools.csv")
	df.t <- ddply(df,.(agency),transform,ccost=cumsum(cost))
	# make breaks for plot - these will follow log scale but in plain notation
	breaks = 10**(1:10)
	# plot cumulative cost over time by agency
	# use log axis for y
	# but breaks will not be in sci notation
	# NB: casting date fields as.Date works wonders for axis labels, plot background, etc!
	ggplot(df.t, aes(x=as.Date(date), y=ccost, colour=agency, group=agency)) + geom_line() + coord_trans(y = "log10") + labs(x="date",y="cumulative spending, USD, (log scale)",title="title") + scale_y_log10(breaks = breaks, labels = comma(breaks))
	# read in data with these conditions:
	# vars are agency names, year with missing data, total agency cost (just for scaling point geoms)
	# sorted by agency total cost asc
	# sorted asc b/c coord_flip() flips that value
	df <- read.csv("/tmp/foo.csv",sep="\t")
	library(ggplot2)
	# make sure to put this in the order the data appears,which is by total cost FY2001-2015
	df$agency <- factor(df$agency, levels=unique(df$agency))
	# plot with no legend, right justified title, use black outlines for minor panel grid, use tiles as geom and color by year
	ggplot(df,aes(x=df$agency,y=df$year)) + coord_flip() + labs(x="eVA agency",y="year",title="Years with No Data by Agency\nState of Virginia Procurement, eVA FY2001-2015") + theme(plot.title = element_text(hjust=1)) + theme(legend.position="none") + geom_tile(fill=df$year) + theme(panel.grid.minor = element_line(colour = "black"))
	# given a table of agency, date, cost
	# produce a table of agency, month,procurement count, USD spent on procurement
	# NB: none of this is inflation adjusted

	in=eVA_costByDateByAgency.tsv; cat eVA_totalCostByAgency.tsv \|head -n 26 \| c 1 \| sed '1d' \| tac \| parallel 'agency_records=$( mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in'); byMonth=$(echo "$agency_records" \| cut -f2 \| sed "s:...$::g" ); countByMonth=$( echo "$byMonth" \| sort \| uniq -c \| sed "s:^\s\+::g;s:\s:\t:g" ); echo "$byMonth" \| sort \| uniq \| while read month; do count=$( echo "$countByMonth" \| grep -E "\b${month}\b" \| cut -f1 ); monthly_cost=$( echo "$agency_records" \| mawk -F"\t" "{if(\$2 ~ /^${month}/)print \$3}" \| awk "{ sum += \$1 } END { printf \"%.4f\n\", sum }" ); echo -e {}"\t$(echo $month \| sed "s:$:-01:g")\t$count\t$monthly_cost"; done' \| sed '1 i\agency\tmonth\tpo_count\tpo_spend'
	# load spending by month with fields:
	# agency name, month, spend for month, year, count of po
	df <- read.csv("extras/eVA_costByMonthByAgency.tsv",sep="\t")
	# load libraries
	library(plyr) # for transform
	library(ggplot2)
	# date field as type date
	df$date <- as.Date(df$date)
	# get po_spend rescaled by agency/year
	df.t <- ddply(df,.(agency,year),transform,rescaled_po_spend=rescale(po_spend))

	png("/tmp/foo.png",unit="in",res=300,height=6,width=12);ggplot(df.t,aes(x=df.t$agency,y=df.t$month)) + coord_flip() + labs(x="agency",y="year") + theme(legend.position="none") + scale_y_date(breaks = date_breaks("years"),labels = date_format("%Y")) + theme(axis.text.x = element_text(angle=290,vjust=.4)) + scale_fill_gradient(low="white",high="steelblue") + theme(panel.grid = element_blank()) + geom_tile(aes(fill=df.t$rescaled_po_spend),color="grey",linestyle="dotted") + ggtitle(expression(atop("High Procurement Spending Months by Agency/Year\nState of Virginia, eVA FY2001-2015", atop(italic("top 25 agencies by total procurement spending. data: bit.ly/eVA-byMonth"), "")))) + theme(plot.title = element_text(hjust=0,vjust=0));dev.off()