abresler/beautiful_time_plot

## beautiful_time_plot
#http://rpubs.com/bradleyboehmke/median_income_trends
# Preprocessing & summarizing data
library(dplyr)
library(tidyr)
library(RCurl)
library(XLConnect)
library(magrittr)
appURL <- "https://www.census.gov/hhes/www/income/data/historical/household/2013/h08.xls"
f = CFILE("exfile.xls", mode="wb")
curlPerform(url = appURL, writedata = f@ref, ssl.verifypeer = FALSE)
close(f)
out <- readWorksheetFromFile(file = "exfile.xls", sheet = "h08", header = FALSE
														 , startRow = 62, endRow = 113)
'State' -> names(out)[1]
out[,c(1,seq(2,61,2))] -> data
paste0("X",2013:1984) -> names(data)[2:31]
data %>% tbl_df -> data

data %>%
	filter(State == "United States") %>%
	gather(Year, Income, X2013:X1984) %>%
	separate(Year, c("left","Year"), sep="X") %>%
	select(-left) %>%
	arrange(Year) -> us

# extract 1984 value as the baseline value and add to us dataframe
us[us$Year==1984,3] -> us_base
us_base$Income -> us$us_baseline

# calculate the percent change in U.S. median income for each year as compared
# to 1984 (aka the baseline)
us %>%
	mutate(us_change = (Income-us_baseline)/us_baseline) -> us
# create a states dataframe, clean and turn into long format
data %>%
	filter(State != "United States") %>%
	gather(Year, Income, X2013:X1984) %>%
	separate(Year, c("left","Year"), sep="X") %>%
	select(-left) %>%
	arrange(Year) %>%
	filter(Income != "NA") -> states

# create baselines for each state
state_base <- states %>%
	filter(Year == 1984) %>%
	select(State, State_Baseline = Income)

# add baseline to the states and calculate the percent change in median income
# for each state as compared to 1984 (aka the baseline)
states %<>%
	left_join(state_base) %>%
	arrange(State) %>%
	mutate(state_change = (Income-State_Baseline)/State_Baseline)

# change year variables from character to numeric
states$Year %<>% as.numeric
us$Year %<>% as.numeric

# get top 5 and bottom 5 states which will allow me to identify them
# graphically
states %>%
	filter(Year == 2013) %>%
	arrange(desc(state_change)) %>%
	mutate(rank = seq(1,length(State), by=1)) %>%
	filter(rank < 6 | rank > 46 ) -> rank

# Visualizatin development
library(grid)
library(scales)
library(ggplot2)

## Plot It

states %>%
	ggplot(aes(Year, state_change, group=State)) +
theme_bw() +
	theme(plot.background = element_blank(),
				panel.grid.minor = element_blank(),
				panel.grid.major.x = element_blank(),
				panel.grid.major.y = element_line(linetype = 3, colour = "grey50"),
				panel.border = element_blank(),
				panel.background = element_blank(),
				axis.ticks = element_blank(),
				axis.title = element_blank()) +
	geom_line(colour="grey90", alpha=.9) -> p

p

#Step 2

#Next, I plot the U.S. overall average with a dashed line.

p %<>% +
	geom_line(data=us, aes(Year, us_change, group=1), linetype=5)

p

#Step 3

#I also wanted to identify how my own state (Ohio) has trended over the years so I filtered the states dataframe for only Ohio data and singled it out with a blue color.

p %<>% +
	geom_line(data=filter(states, State=="Ohio"),
						aes(Year, state_change, group=State), colour="dodgerblue",
						size = 1)

p

#Step 4

#I wanted to create some boundaries with the biggest economic “winner” and “loser”. So I identified the state that had the largest growth from 1984 to 2013 and the state that had the greatest contraction and singled these out with a slightly darker grey than all the other states.

p %<>% +
	geom_line(data=filter(states, State=="D.C."),
						aes(Year, state_change, group=State), colour="grey70") +
	geom_line(data=filter(states, State=="Nevada"),
						aes(Year, state_change, group=State), colour="grey70")

p
#Step 5

#Next, I identify the top 5 and bottom 5 states along the overall US by plotting points on their 2013 values.

p %<>% +
	geom_point(data=rank, aes(Year, state_change), shape=21, size=1.5, alpha=.6) +
	geom_point(data=filter(us, Year == 2013), aes(Year, us_change), size=2.5, alpha=.6)

p

#Step 6

#the last step I performed in R was to format the x- and y-axis. For the y-axis I fixed the limits and breaks (this was primarily because I was tinkering around with the dimensions of the chart but wanted to keep the breaks fixed) and turned the labels to a percent format. For the x-axis I increased the breaks to every 5 years and reduced the padding at the ends of the axis. Also note that I extend the x-axis to 1983 even though my data only goes back to 1984. This is to add more space on the left side of the x-axis; the reason for this becomes evident in the final graphic where I move the y-axis labels.

p %<>% +
	scale_y_continuous(limits=c(-.2,.55), breaks=seq(-.2,.4,by=.2), label=percent) +
	scale_x_continuous(limits=c(1983,2013),breaks=seq(1985,2010,by=5), expand=c(0,.25))

p + ggtitle("Brad's Beautiful Time Series Plot") +
	theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20))
	#http://rpubs.com/bradleyboehmke/median_income_trends
	# Preprocessing & summarizing data
	library(dplyr)
	library(tidyr)
	library(RCurl)
	library(XLConnect)
	library(magrittr)
	appURL <- "https://www.census.gov/hhes/www/income/data/historical/household/2013/h08.xls"
	f = CFILE("exfile.xls", mode="wb")
	curlPerform(url = appURL, writedata = f@ref, ssl.verifypeer = FALSE)
	close(f)
	out <- readWorksheetFromFile(file = "exfile.xls", sheet = "h08", header = FALSE
	, startRow = 62, endRow = 113)
	'State' -> names(out)[1]
	out[,c(1,seq(2,61,2))] -> data
	paste0("X",2013:1984) -> names(data)[2:31]
	data %>% tbl_df -> data

	data %>%
	filter(State == "United States") %>%
	gather(Year, Income, X2013:X1984) %>%
	separate(Year, c("left","Year"), sep="X") %>%
	select(-left) %>%
	arrange(Year) -> us

	# extract 1984 value as the baseline value and add to us dataframe
	us[us$Year==1984,3] -> us_base
	us_base$Income -> us$us_baseline

	# calculate the percent change in U.S. median income for each year as compared
	# to 1984 (aka the baseline)
	us %>%
	mutate(us_change = (Income-us_baseline)/us_baseline) -> us
	# create a states dataframe, clean and turn into long format
	data %>%
	filter(State != "United States") %>%
	gather(Year, Income, X2013:X1984) %>%
	separate(Year, c("left","Year"), sep="X") %>%
	select(-left) %>%
	arrange(Year) %>%
	filter(Income != "NA") -> states

	# create baselines for each state
	state_base <- states %>%
	filter(Year == 1984) %>%
	select(State, State_Baseline = Income)

	# add baseline to the states and calculate the percent change in median income
	# for each state as compared to 1984 (aka the baseline)
	states %<>%
	left_join(state_base) %>%
	arrange(State) %>%
	mutate(state_change = (Income-State_Baseline)/State_Baseline)

	# change year variables from character to numeric
	states$Year %<>% as.numeric
	us$Year %<>% as.numeric

	# get top 5 and bottom 5 states which will allow me to identify them
	# graphically
	states %>%
	filter(Year == 2013) %>%
	arrange(desc(state_change)) %>%
	mutate(rank = seq(1,length(State), by=1)) %>%
	filter(rank < 6 \| rank > 46 ) -> rank

	# Visualizatin development
	library(grid)
	library(scales)
	library(ggplot2)

	## Plot It

	states %>%
	ggplot(aes(Year, state_change, group=State)) +
	theme_bw() +
	theme(plot.background = element_blank(),
	panel.grid.minor = element_blank(),
	panel.grid.major.x = element_blank(),
	panel.grid.major.y = element_line(linetype = 3, colour = "grey50"),
	panel.border = element_blank(),
	panel.background = element_blank(),
	axis.ticks = element_blank(),
	axis.title = element_blank()) +
	geom_line(colour="grey90", alpha=.9) -> p

	p

	#Step 2

	#Next, I plot the U.S. overall average with a dashed line.

	p %<>% +
	geom_line(data=us, aes(Year, us_change, group=1), linetype=5)

	p

	#Step 3

	#I also wanted to identify how my own state (Ohio) has trended over the years so I filtered the states dataframe for only Ohio data and singled it out with a blue color.

	p %<>% +
	geom_line(data=filter(states, State=="Ohio"),
	aes(Year, state_change, group=State), colour="dodgerblue",
	size = 1)

	p

	#Step 4

	#I wanted to create some boundaries with the biggest economic “winner” and “loser”. So I identified the state that had the largest growth from 1984 to 2013 and the state that had the greatest contraction and singled these out with a slightly darker grey than all the other states.

	p %<>% +
	geom_line(data=filter(states, State=="D.C."),
	aes(Year, state_change, group=State), colour="grey70") +
	geom_line(data=filter(states, State=="Nevada"),
	aes(Year, state_change, group=State), colour="grey70")

	p
	#Step 5

	#Next, I identify the top 5 and bottom 5 states along the overall US by plotting points on their 2013 values.

	p %<>% +
	geom_point(data=rank, aes(Year, state_change), shape=21, size=1.5, alpha=.6) +
	geom_point(data=filter(us, Year == 2013), aes(Year, us_change), size=2.5, alpha=.6)

	p

	#Step 6

	#the last step I performed in R was to format the x- and y-axis. For the y-axis I fixed the limits and breaks (this was primarily because I was tinkering around with the dimensions of the chart but wanted to keep the breaks fixed) and turned the labels to a percent format. For the x-axis I increased the breaks to every 5 years and reduced the padding at the ends of the axis. Also note that I extend the x-axis to 1983 even though my data only goes back to 1984. This is to add more space on the left side of the x-axis; the reason for this becomes evident in the final graphic where I move the y-axis labels.

	p %<>% +
	scale_y_continuous(limits=c(-.2,.55), breaks=seq(-.2,.4,by=.2), label=percent) +
	scale_x_continuous(limits=c(1983,2013),breaks=seq(1985,2010,by=5), expand=c(0,.25))

	p + ggtitle("Brad's Beautiful Time Series Plot") +
	theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20))