Created
July 12, 2013 06:51
-
-
Save abelsonlive/5982440 to your computer and use it in GitHub Desktop.
munge treasury.io data for table_ii into data-sciencey format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load in libraries | |
library("lubridate") | |
library("plyr") | |
library("RColorBrewer") | |
# where is your data located, lets switch to that dir! | |
PATH_TO_DATA_DIR <- "~/Dropbox/code/federal-treasury-api/data/lifetime_csv" | |
setwd(PATH_TO_DATA_DIR) | |
# read in csv. make sure to have stringsAsFactors=FALSE | |
df <- read.csv("table_ii.csv", stringsAsFactors=FALSE) | |
# take a specific set of columns | |
# here the select arg takes an array, | |
# specified by c(item1, item2, ...), | |
# that indicates the name of columns | |
# we want. These can also be accessed | |
# by df[, 'date'], df[,'item'], etc... | |
# for instance, we could subset the data | |
# the same way in this manner: | |
# df <- df[ ,c("date", "item", "today", "type")] | |
df <- subset(df, select=c("date", "item", "today", "type")) | |
# make withdrawals negative | |
# here we're only changing those | |
# elements of df$today where | |
# the type of the transaction | |
# is a withdrawal | |
# | |
# df$type=="withdrawal" is a | |
# vectorized comparison and returns | |
# an array of row indices that correspond | |
# with instances where this comparison is TRUE | |
# feeding the result of this comparsion into | |
# hard brackets, which allow for row references of | |
# arrays, allows us to perform our operation on | |
# a subset of the data | |
df$today[df$type=="withdrawal"] = df$today[df$type=="withdrawal"] * -1 | |
# function for reducing items by day | |
reduce_day <- function(x) { | |
# this functions sums all transactions for a given item | |
# on a given day (via tapply() ) and transposes them | |
# (via t() ) so that each day has | |
# one row and one column for each item on that day. | |
# the value of each column thus represents the net change | |
# in value for the corresponding item(or column name) | |
val_df <- data.frame(t(tapply(x$today, x$item, sum))) | |
val_df$date <- x$date[1] | |
return(val_df) | |
} | |
# apply this function across every day in our data. | |
# output in this a list since we're not sure of the | |
# full breadth of possible columns ( ddply() would break with this fx) | |
df_list <- dlply(df, "date", reduce_day, .progress="text") | |
# fill in columns, this is a plyr fx | |
df_final <- rbind.fill(df_list) | |
# fix names | |
names(df_final) <- tolower(gsub("[\\.]+", "_", str_trim(names(df_final)))) | |
# reformat date, | |
# this is a fx from the lubridate package | |
# lubridate infers date formats from strings | |
# its function names follow the date format | |
# you're trying to infer | |
# ie: 2012-06-21 => ymd() , 06/21/2012 => mdy(), 2012-06-21 19:45:02 => ymdhms() | |
df_final$date <- ymd(df_final$date) | |
# fix nas | |
df_final[is.na(df_final)] <- 0 | |
# take a head of this data | |
head(df_final) | |
# make a timeseries plot of each item and save | |
# it to a pdf for gif-i-fying | |
# create a palette of 9 colors to choose from | |
pal <- brewer.pal(9,"Set1") | |
plot_item <- function(name) { | |
# this is a function that, given a column name, | |
# creates a time series plot for the data associated | |
# with that column name (here, an item). | |
# randomly vary the color, line width, and line type | |
# in these plots | |
col = df_final[, grep(name, names(df_final))] | |
par(mai=c(0.7,0.7,0.2,0.2)) | |
plot(x=df_final$date[col>0], | |
y=col[col>0], | |
type="l", | |
lty=sample(1:5, 1), | |
lwd=sample(1:5, 1), | |
col=pal[sample(1:9, 1)], | |
bty="n", | |
xlab="date", | |
ylab="amount", | |
main=name ) | |
} | |
# loop through all the column names in our | |
# database and generate a plot | |
for (name in names(df_final)) { | |
if (name!="date") { | |
cat("printing", name, "\n") | |
# hack to catch errors in plots | |
# this essentially means we're generating | |
# the plot 2x | |
p = try(plot_item(name)) | |
if (class(p)=='try-error') { | |
cat("whups\n") | |
} else { | |
# write each plot to file | |
# open up a file | |
pdf(file=paste0(name, ".pdf"), width=5, height=5) | |
# generate a plot | |
plot_item(name) | |
# write that file | |
dev.off() | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment