Code for post about creating a dataviz using new ggplot2 features and cowplot. post" http://matthewdharris.com/2016/03/19/visualizing-zubrow-ggplot2-cowplot/ tweet: https://twitter.com/Md_Harris/status/710639632711618560
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("ggplot2") # Must use Dev version as of 03/18/16 | |
library("gridExtra") | |
library("extrafont") # for font selection | |
library("dplyr") # for data preperation | |
library("cowplot") # for combining plots | |
# Prepare data for plotting | |
# data from Zubrow, E.B.W. (1974), Population, Contact,and Climate in the New Mexican Pueblos | |
# prepared as a long format to facilitate plotting | |
year <- c(1760, 1790, 1797, 1850, 1860, 1889, 1900, 1910, 1950) | |
sites <- c("Isleta", "Acoma", "Laguna", "Zuni", "Sandia", "San Felipe", | |
"Santa Ana", "Zia", "Santo Domingo", "Jemez", "Cochiti", | |
"Tesuque", "Nambe", "San Ildefonso", "Pojoaque", "Santa Clara", | |
"San Juan", "Picuris", "Toas") | |
# [sites] are arranged in South to North geographic order | |
south <- c(seq_along(sites)) | |
# add index to rearrange geographically from East to West | |
east <- c(14, 18, 17, 19, 12, 11, 13, 15, 10, 16, 9, 4, 3, 7, 5, 8, 6, 2, 1) | |
# population figures | |
pop <- c( | |
c(304, 410, 603, 751, 440, 1037, 1035, 956, 1051), | |
c(1052, 820, 757, 367, 523, 582, 492, 691, 1376), | |
c(600, 668, 802, 749, 929, 970, 1077, 1472, 1655), | |
c(664, 1935, 2716, 1294, 1300, 1547, 1525, 1667, 2564), | |
c(291, 304, 116, 241, 217, 150, 81, 73, 150), | |
c(458, 532, 282, 800, 360, 501, 515, 502, 721), | |
c(404, 356, 634, 339, 316, 264, 228, 219, 285), | |
c(568, 275, 262, 124, 115, 113, 115, 109, 145), | |
c(424, 650, 483, 666, 262, 930, 771, 817, 978), | |
c(373, 485, 272, 365, 650, 474, 452, 449, 789), | |
c(450, 720, 505, 254, 172, 300, 247, 237, 289), | |
c(232, 138, 155, 119, 97, 94, 80, 80, 145), | |
c(204, 155, 178, 107, 107, 80, 81, 88, 96), | |
c(484, 240, 251, 319, 166, 189, 137, 114, 152), | |
c(99, 53, 79, 48, 37, 18, 12, 16, 2), | |
c(257, 134, 193, 279, 179, 187, 222, 243, 511), | |
c(316, 260, 202, 568, 343, 373, 422, 388, 152), | |
c(328, 254, 251, 222, 143, 120, 95, 104, 99), | |
c(505, 518, 531, 361, 363, 324, 462, 517, 842) | |
) | |
# combine above in a data.frame where each population is the key observation | |
dat <- data.frame(Year = rep(year, length(sites)), | |
Site = rep(sites, each = length(year)), | |
Population = pop, | |
South = rep(south, each = length(year)), | |
East = rep(east, each = length(year))) | |
# add a factor that arranges the levels of site by their East to West geographic location | |
dat$East_label <- rep(factor(sites, levels = (sites[order(east)])), each = length(year)) | |
# Calculate quantities of interest | |
# this ibe-liner calculates the date to date change in population | |
# ave() groups and runs the diff() function that is concatenated with a 0 for the first observation in each group | |
# probably can make this dplyr, but it works... | |
dat$Change <- ave(dat$Population, dat$Site, FUN = function(x) c(0, diff(x))) | |
# using dplyr to group the observations by site/Pueblo and calcualte stuff | |
dat <- group_by(dat, Site) %>% | |
# change from inception date of 1760 (first data point) | |
mutate(Relative_Change = Population - Population[1]) %>% | |
# percent change of log population (not used in this graphic) | |
mutate(Percent_Change = (Population - lag(Population))/lag(Population)) %>% | |
# change from inception scaled to % change from inception | |
mutate(Scaled_Change = (Relative_Change / Population[1])) %>% | |
# make into a data.frame | |
data.frame() | |
# select only the rows that are the last time period observation of 1950 | |
# this is for the second plot | |
net_change <- dat[which(dat$Year == 1950),] | |
# plotting with ggplot2 (dev version from github) | |
# I started doing the [p1 <- p1 +] way to build these b/c others did and | |
# I liked how I could add selective lines while testing without changing anything | |
# so, it is more things to type, but whatever. It can be done in one ggplot() call if you like | |
# also, I code to get stuff done then optimize later. You may likey find ways to do this better | |
# Plot #1: Percent change from inception for each site | |
# set up data and axis | |
p1 <- ggplot(dat, aes(x = as.factor(Year), y = Scaled_Change, group = East)) | |
# this is the bar plot part | |
p1 <- p1 + geom_bar(stat="identity", fill = "skyblue", color = "white") | |
# this is the redline outlining the bars | |
p1 <- p1 + geom_line(color = "firebrick1", alpha = 0.95) | |
# wrap it by the reordered factor so each individual site plots | |
p1 <- p1 + facet_wrap( ~ East_label, nrow = 2) | |
# add a reference line at zero | |
p1 <- p1 + geom_hline(yintercept = 0, linetype = 1, color = "gray35", size = 0.5) | |
# scale the y axis and calculate it as a percentage | |
p1 <- p1 + scale_y_continuous(breaks = c(seq(-1,3,1)), labels = scales::percent) | |
# thin out the dates in the x axis | |
p1 <- p1 + scale_x_discrete(breaks = c(1760, 1797, 1860, 1900, 1950)) | |
# trusty old theme_bw() to prepare the look of the plot | |
p1 <- p1 + theme_bw() | |
# fine suggestion by @ClausWilke to make plot less busy | |
p1 <- p1 + background_grid(major = "xy", minor = "none") | |
# add the title, subtitle, and axis labels | |
# this is a new feature only available in the dev version of ggplot2 | |
p1 <- p1 + labs(title="Percent Population Change from in New Mexico Pueblos from 1760 to 1950", | |
subtitle="Arranged from East to West", | |
x = "Year", | |
y = "Percent Population Change") | |
# all the things in the theme() section that stylize the plot and fonts | |
p1 <- p1 + theme( | |
strip.background = element_rect(colour = "white", fill = "white"), | |
strip.text.x = element_text(colour = "black", size = 7, face = "bold", | |
family = "Trebuchet MS"), | |
panel.margin = unit(0.3, "lines"), | |
panel.border = element_rect(colour = "gray90"), | |
axis.text.x = element_text(angle = 90, size = 6, family = "Trebuchet MS"), | |
axis.text.y = element_text(size = 6, family = "Trebuchet MS"), | |
axis.title = element_text(size = 8, family = "Trebuchet MS"), | |
# the three elements below are all new to the dev ersion | |
plot.caption = element_text(size = 8, hjust=0, margin=margin(t=5), | |
family = "Trebuchet MS"), | |
plot.title=element_text(family="TrebuchetMS-Bold"), | |
plot.subtitle=element_text(family="TrebuchetMS-Italic") | |
) | |
# Plot #2: Net change in population by site | |
# this uses a different data.frame from plot #1 | |
# most of this is all the same as Plot #1, but not faceted. | |
# one trick here is to have two geom_bar calls, but the second is just data below the 0 line | |
p2 <- ggplot(data = net_change, aes(x = East_label, y = Scaled_Change, group = Year)) | |
# first call to geom_bar for all data | |
p2 <- p2 + geom_bar(stat="identity", fill = "skyblue") | |
# second call to geom_bar for just negative data with a different color | |
p2 <- p2 + geom_bar(data = net_change[which(net_change$Scaled_Change < 0),], | |
stat="identity", fill = "skyblue4") | |
# a spline smooth to add the fit that shows the trend of population flow | |
p2 <- p2 + geom_smooth(data = transform(net_change), | |
method = "lm", formula = y ~ splines::bs(x, 3), | |
se = FALSE, color = "firebrick1", size = 0.5) | |
p2 <- p2 + geom_hline(yintercept = 0, linetype = 1, color = "gray35", size = 0.5) | |
p2 <- p2 + scale_y_continuous(breaks = c(seq(-1,3,0.5)), labels = scales::percent) | |
p2 <- p2 + theme_bw() | |
# fine suggestion by @ClausWilke to make plot less busy | |
p2 <- p2 + background_grid(major = "xy", minor = "none") | |
# same new features as described in plot #1 | |
p2 <- p2 + labs(title="Net Percent Population Change from 1760 to 1950", | |
subtitle="Arranged from East to West", | |
x = NULL, | |
y = "Percent Population Change", | |
caption="Data: Zubrow, E.B.W. (1974), Population, Contact,and Climate in the New Mexican Pueblos") | |
p2 <- p2 + theme( | |
axis.text.x = element_text(angle = 90, size = 8, hjust = 1, family = "Trebuchet MS"), | |
axis.text.y = element_text(size = 7, family = "Trebuchet MS"), | |
axis.title = element_text(size = 8, family = "Trebuchet MS"), | |
# same new features as described in plot #1 | |
plot.caption = element_text(size = 8, hjust=0, margin=margin(t=5), | |
family = "Trebuchet MS"), | |
plot.title=element_text(family="TrebuchetMS-Bold"), | |
plot.subtitle=element_text(family="TrebuchetMS-Italic") | |
) | |
# now the magic of cowplot to combine the plots | |
# plot_grid does the arranging of p1 and p2 | |
# [labels] gives identification to each plot for your figure caption | |
# [nrow] arrnges them in stacked rows | |
# [rel_heights] is really important. This allows for the top plot to be made larger (1.75 times) | |
# than the lower plot. If your plots are not realtively equal in size, you may need this | |
# I adjusted by trial and error | |
cplot1 <- plot_grid(p1, p2, labels = c("A", "B"), nrow = 2, rel_heights = c(1.75,1)) | |
# get an idea of what it looks like | |
plot(cplot1) | |
# save cowplot as an image! | |
# [base_width] and [base_height] allow of sizing the fina image Use in conjunction with [rel_heights] | |
save_plot("cowplot.png", cplot1, base_width = 8, base_height = 7.5) | |
# Done! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment