abelsonlive/treasury_munging.R

## treasury_munging.R
# load in libraries
library("lubridate")
library("plyr")
library("RColorBrewer")

# where is your data located, lets switch to that dir!
PATH_TO_DATA_DIR <- "~/Dropbox/code/federal-treasury-api/data/lifetime_csv"
setwd(PATH_TO_DATA_DIR)

# read in csv. make sure to have stringsAsFactors=FALSE
df <- read.csv("table_ii.csv", stringsAsFactors=FALSE)

# take a specific set of columns
# here the select arg takes an array,
# specified by c(item1, item2, ...),
# that indicates the name of columns
# we want. These can also be accessed
# by df[, 'date'], df[,'item'], etc...
# for instance, we could subset the data
# the same way in this manner:
# df <- df[ ,c("date", "item", "today", "type")]
df <- subset(df, select=c("date", "item", "today", "type"))

# make withdrawals negative

# here we're only changing those
# elements of df$today where
# the type of the transaction
# is a withdrawal
#
# df$type=="withdrawal" is a
# vectorized comparison and returns
# an array of row indices that correspond
# with instances where this comparison is TRUE
# feeding the result of this comparsion into
# hard brackets, which allow for row references of
# arrays, allows us to perform our operation on
# a subset of the data
df$today[df$type=="withdrawal"] = df$today[df$type=="withdrawal"] * -1


# function for reducing items by day
reduce_day <- function(x) {
  # this functions sums all transactions for a given item
  # on a given day (via tapply() ) and transposes them
  # (via t() ) so that each day has
  # one row and one column for each item on that day.
  # the value of each column thus represents the net change
  # in value for the corresponding item(or column name)
  val_df <- data.frame(t(tapply(x$today, x$item, sum)))
  val_df$date <- x$date[1]
  return(val_df)
}

# apply this function across every day in our data.
# output in this a list since we're not sure of the
# full breadth of possible columns ( ddply() would break with this fx)
df_list <- dlply(df, "date", reduce_day, .progress="text")

# fill in columns, this is a plyr fx
df_final <- rbind.fill(df_list)

# fix names
names(df_final) <- tolower(gsub("[\\.]+", "_", str_trim(names(df_final))))

# reformat date,
# this is a fx from the lubridate package
# lubridate infers date formats from strings
# its function names follow the date format
# you're trying to infer
# ie: 2012-06-21 => ymd() , 06/21/2012 => mdy(), 2012-06-21 19:45:02 => ymdhms()
df_final$date <- ymd(df_final$date)

# fix nas
df_final[is.na(df_final)] <- 0

# take a head of this data
head(df_final)

# make a timeseries plot of each item and save
# it to a pdf for gif-i-fying

# create a palette of 9 colors to choose from
pal <- brewer.pal(9,"Set1")

plot_item <- function(name) {
  # this is  a function that, given a column name,
  # creates a time series plot for the data associated
  # with that column name (here, an item).
  # randomly vary the color, line width, and line type
  # in these plots
  col = df_final[, grep(name, names(df_final))]
  par(mai=c(0.7,0.7,0.2,0.2))
  plot(x=df_final$date[col>0],
       y=col[col>0],
       type="l",
       lty=sample(1:5, 1),
       lwd=sample(1:5, 1),
       col=pal[sample(1:9, 1)],
       bty="n",
       xlab="date",
       ylab="amount",
       main=name )
}
# loop through all the column names in our
# database and generate a plot
for (name in names(df_final)) {
  if (name!="date") {
    cat("printing", name, "\n")
    # hack to catch errors in plots
    # this essentially means we're generating
    # the plot 2x
    p = try(plot_item(name))
    if (class(p)=='try-error') {
      cat("whups\n")
    } else {
      # write each plot to file

      # open up a file
      pdf(file=paste0(name, ".pdf"), width=5, height=5)

      # generate a plot
      plot_item(name)

      # write that file
      dev.off()
    }
  }
}
	# load in libraries
	library("lubridate")
	library("plyr")
	library("RColorBrewer")

	# where is your data located, lets switch to that dir!
	PATH_TO_DATA_DIR <- "~/Dropbox/code/federal-treasury-api/data/lifetime_csv"
	setwd(PATH_TO_DATA_DIR)

	# read in csv. make sure to have stringsAsFactors=FALSE
	df <- read.csv("table_ii.csv", stringsAsFactors=FALSE)

	# take a specific set of columns
	# here the select arg takes an array,
	# specified by c(item1, item2, ...),
	# that indicates the name of columns
	# we want. These can also be accessed
	# by df[, 'date'], df[,'item'], etc...
	# for instance, we could subset the data
	# the same way in this manner:
	# df <- df[ ,c("date", "item", "today", "type")]
	df <- subset(df, select=c("date", "item", "today", "type"))

	# make withdrawals negative

	# here we're only changing those
	# elements of df$today where
	# the type of the transaction
	# is a withdrawal
	#
	# df$type=="withdrawal" is a
	# vectorized comparison and returns
	# an array of row indices that correspond
	# with instances where this comparison is TRUE
	# feeding the result of this comparsion into
	# hard brackets, which allow for row references of
	# arrays, allows us to perform our operation on
	# a subset of the data
	df$today[df$type=="withdrawal"] = df$today[df$type=="withdrawal"] * -1


	# function for reducing items by day
	reduce_day <- function(x) {
	# this functions sums all transactions for a given item
	# on a given day (via tapply() ) and transposes them
	# (via t() ) so that each day has
	# one row and one column for each item on that day.
	# the value of each column thus represents the net change
	# in value for the corresponding item(or column name)
	val_df <- data.frame(t(tapply(x$today, x$item, sum)))
	val_df$date <- x$date[1]
	return(val_df)
	}

	# apply this function across every day in our data.
	# output in this a list since we're not sure of the
	# full breadth of possible columns ( ddply() would break with this fx)
	df_list <- dlply(df, "date", reduce_day, .progress="text")

	# fill in columns, this is a plyr fx
	df_final <- rbind.fill(df_list)

	# fix names
	names(df_final) <- tolower(gsub("[\\.]+", "_", str_trim(names(df_final))))

	# reformat date,
	# this is a fx from the lubridate package
	# lubridate infers date formats from strings
	# its function names follow the date format
	# you're trying to infer
	# ie: 2012-06-21 => ymd() , 06/21/2012 => mdy(), 2012-06-21 19:45:02 => ymdhms()
	df_final$date <- ymd(df_final$date)

	# fix nas
	df_final[is.na(df_final)] <- 0

	# take a head of this data
	head(df_final)

	# make a timeseries plot of each item and save
	# it to a pdf for gif-i-fying

	# create a palette of 9 colors to choose from
	pal <- brewer.pal(9,"Set1")

	plot_item <- function(name) {
	# this is a function that, given a column name,
	# creates a time series plot for the data associated
	# with that column name (here, an item).
	# randomly vary the color, line width, and line type
	# in these plots
	col = df_final[, grep(name, names(df_final))]
	par(mai=c(0.7,0.7,0.2,0.2))
	plot(x=df_final$date[col>0],
	y=col[col>0],
	type="l",
	lty=sample(1:5, 1),
	lwd=sample(1:5, 1),
	col=pal[sample(1:9, 1)],
	bty="n",
	xlab="date",
	ylab="amount",
	main=name )
	}
	# loop through all the column names in our
	# database and generate a plot
	for (name in names(df_final)) {
	if (name!="date") {
	cat("printing", name, "\n")
	# hack to catch errors in plots
	# this essentially means we're generating
	# the plot 2x
	p = try(plot_item(name))
	if (class(p)=='try-error') {
	cat("whups\n")
	} else {
	# write each plot to file

	# open up a file
	pdf(file=paste0(name, ".pdf"), width=5, height=5)

	# generate a plot
	plot_item(name)

	# write that file
	dev.off()
	}
	}
	}