Skip to content

Instantly share code, notes, and snippets.

@abresler
Created January 27, 2015 06:22
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save abresler/b9a7896d456902a4b3d7 to your computer and use it in GitHub Desktop.
Save abresler/b9a7896d456902a4b3d7 to your computer and use it in GitHub Desktop.
#http://rpubs.com/bradleyboehmke/median_income_trends
# Preprocessing & summarizing data
library(dplyr)
library(tidyr)
library(RCurl)
library(XLConnect)
library(magrittr)
appURL <- "https://www.census.gov/hhes/www/income/data/historical/household/2013/h08.xls"
f = CFILE("exfile.xls", mode="wb")
curlPerform(url = appURL, writedata = f@ref, ssl.verifypeer = FALSE)
close(f)
out <- readWorksheetFromFile(file = "exfile.xls", sheet = "h08", header = FALSE
, startRow = 62, endRow = 113)
'State' -> names(out)[1]
out[,c(1,seq(2,61,2))] -> data
paste0("X",2013:1984) -> names(data)[2:31]
data %>% tbl_df -> data
data %>%
filter(State == "United States") %>%
gather(Year, Income, X2013:X1984) %>%
separate(Year, c("left","Year"), sep="X") %>%
select(-left) %>%
arrange(Year) -> us
# extract 1984 value as the baseline value and add to us dataframe
us[us$Year==1984,3] -> us_base
us_base$Income -> us$us_baseline
# calculate the percent change in U.S. median income for each year as compared
# to 1984 (aka the baseline)
us %>%
mutate(us_change = (Income-us_baseline)/us_baseline) -> us
# create a states dataframe, clean and turn into long format
data %>%
filter(State != "United States") %>%
gather(Year, Income, X2013:X1984) %>%
separate(Year, c("left","Year"), sep="X") %>%
select(-left) %>%
arrange(Year) %>%
filter(Income != "NA") -> states
# create baselines for each state
state_base <- states %>%
filter(Year == 1984) %>%
select(State, State_Baseline = Income)
# add baseline to the states and calculate the percent change in median income
# for each state as compared to 1984 (aka the baseline)
states %<>%
left_join(state_base) %>%
arrange(State) %>%
mutate(state_change = (Income-State_Baseline)/State_Baseline)
# change year variables from character to numeric
states$Year %<>% as.numeric
us$Year %<>% as.numeric
# get top 5 and bottom 5 states which will allow me to identify them
# graphically
states %>%
filter(Year == 2013) %>%
arrange(desc(state_change)) %>%
mutate(rank = seq(1,length(State), by=1)) %>%
filter(rank < 6 | rank > 46 ) -> rank
# Visualizatin development
library(grid)
library(scales)
library(ggplot2)
## Plot It
states %>%
ggplot(aes(Year, state_change, group=State)) +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = 3, colour = "grey50"),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()) +
geom_line(colour="grey90", alpha=.9) -> p
p
#Step 2
#Next, I plot the U.S. overall average with a dashed line.
p %<>% +
geom_line(data=us, aes(Year, us_change, group=1), linetype=5)
p
#Step 3
#I also wanted to identify how my own state (Ohio) has trended over the years so I filtered the states dataframe for only Ohio data and singled it out with a blue color.
p %<>% +
geom_line(data=filter(states, State=="Ohio"),
aes(Year, state_change, group=State), colour="dodgerblue",
size = 1)
p
#Step 4
#I wanted to create some boundaries with the biggest economic “winner” and “loser”. So I identified the state that had the largest growth from 1984 to 2013 and the state that had the greatest contraction and singled these out with a slightly darker grey than all the other states.
p %<>% +
geom_line(data=filter(states, State=="D.C."),
aes(Year, state_change, group=State), colour="grey70") +
geom_line(data=filter(states, State=="Nevada"),
aes(Year, state_change, group=State), colour="grey70")
p
#Step 5
#Next, I identify the top 5 and bottom 5 states along the overall US by plotting points on their 2013 values.
p %<>% +
geom_point(data=rank, aes(Year, state_change), shape=21, size=1.5, alpha=.6) +
geom_point(data=filter(us, Year == 2013), aes(Year, us_change), size=2.5, alpha=.6)
p
#Step 6
#the last step I performed in R was to format the x- and y-axis. For the y-axis I fixed the limits and breaks (this was primarily because I was tinkering around with the dimensions of the chart but wanted to keep the breaks fixed) and turned the labels to a percent format. For the x-axis I increased the breaks to every 5 years and reduced the padding at the ends of the axis. Also note that I extend the x-axis to 1983 even though my data only goes back to 1984. This is to add more space on the left side of the x-axis; the reason for this becomes evident in the final graphic where I move the y-axis labels.
p %<>% +
scale_y_continuous(limits=c(-.2,.55), breaks=seq(-.2,.4,by=.2), label=percent) +
scale_x_continuous(limits=c(1983,2013),breaks=seq(1985,2010,by=5), expand=c(0,.25))
p + ggtitle("Brad's Beautiful Time Series Plot") +
theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment