Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Created July 12, 2013 06:51
Show Gist options
  • Save abelsonlive/5982440 to your computer and use it in GitHub Desktop.
Save abelsonlive/5982440 to your computer and use it in GitHub Desktop.
munge treasury.io data for table_ii into data-sciencey format.
# load in libraries
library("lubridate")
library("plyr")
library("RColorBrewer")
# where is your data located, lets switch to that dir!
PATH_TO_DATA_DIR <- "~/Dropbox/code/federal-treasury-api/data/lifetime_csv"
setwd(PATH_TO_DATA_DIR)
# read in csv. make sure to have stringsAsFactors=FALSE
df <- read.csv("table_ii.csv", stringsAsFactors=FALSE)
# take a specific set of columns
# here the select arg takes an array,
# specified by c(item1, item2, ...),
# that indicates the name of columns
# we want. These can also be accessed
# by df[, 'date'], df[,'item'], etc...
# for instance, we could subset the data
# the same way in this manner:
# df <- df[ ,c("date", "item", "today", "type")]
df <- subset(df, select=c("date", "item", "today", "type"))
# make withdrawals negative
# here we're only changing those
# elements of df$today where
# the type of the transaction
# is a withdrawal
#
# df$type=="withdrawal" is a
# vectorized comparison and returns
# an array of row indices that correspond
# with instances where this comparison is TRUE
# feeding the result of this comparsion into
# hard brackets, which allow for row references of
# arrays, allows us to perform our operation on
# a subset of the data
df$today[df$type=="withdrawal"] = df$today[df$type=="withdrawal"] * -1
# function for reducing items by day
reduce_day <- function(x) {
# this functions sums all transactions for a given item
# on a given day (via tapply() ) and transposes them
# (via t() ) so that each day has
# one row and one column for each item on that day.
# the value of each column thus represents the net change
# in value for the corresponding item(or column name)
val_df <- data.frame(t(tapply(x$today, x$item, sum)))
val_df$date <- x$date[1]
return(val_df)
}
# apply this function across every day in our data.
# output in this a list since we're not sure of the
# full breadth of possible columns ( ddply() would break with this fx)
df_list <- dlply(df, "date", reduce_day, .progress="text")
# fill in columns, this is a plyr fx
df_final <- rbind.fill(df_list)
# fix names
names(df_final) <- tolower(gsub("[\\.]+", "_", str_trim(names(df_final))))
# reformat date,
# this is a fx from the lubridate package
# lubridate infers date formats from strings
# its function names follow the date format
# you're trying to infer
# ie: 2012-06-21 => ymd() , 06/21/2012 => mdy(), 2012-06-21 19:45:02 => ymdhms()
df_final$date <- ymd(df_final$date)
# fix nas
df_final[is.na(df_final)] <- 0
# take a head of this data
head(df_final)
# make a timeseries plot of each item and save
# it to a pdf for gif-i-fying
# create a palette of 9 colors to choose from
pal <- brewer.pal(9,"Set1")
plot_item <- function(name) {
# this is a function that, given a column name,
# creates a time series plot for the data associated
# with that column name (here, an item).
# randomly vary the color, line width, and line type
# in these plots
col = df_final[, grep(name, names(df_final))]
par(mai=c(0.7,0.7,0.2,0.2))
plot(x=df_final$date[col>0],
y=col[col>0],
type="l",
lty=sample(1:5, 1),
lwd=sample(1:5, 1),
col=pal[sample(1:9, 1)],
bty="n",
xlab="date",
ylab="amount",
main=name )
}
# loop through all the column names in our
# database and generate a plot
for (name in names(df_final)) {
if (name!="date") {
cat("printing", name, "\n")
# hack to catch errors in plots
# this essentially means we're generating
# the plot 2x
p = try(plot_item(name))
if (class(p)=='try-error') {
cat("whups\n")
} else {
# write each plot to file
# open up a file
pdf(file=paste0(name, ".pdf"), width=5, height=5)
# generate a plot
plot_item(name)
# write that file
dev.off()
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment