Last active
September 25, 2015 16:15
-
-
Save davidski/3fb7e7578e22b5d51df8 to your computer and use it in GitHub Desktop.
Coursera RStudio First Snowfall Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://www.rstudio.com/resources/cheatsheets/ | |
Particularly | |
- Data Wrangling - https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf | |
- Data Visualization - https://www.rstudio.com/wp-content/uploads/2015/08/ggplot2-cheatsheet.pdf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Name: Snowfirst.R | |
# Date: 9/25/2015 | |
# Purpose: Demo code for Coursera Study Group | |
# Almost entirely based upon the work of Bob Rudis (@hrbrmstr) | |
# https://github.com/hrbrmstr/snowfirst | |
# install any missing packages | |
list_of_packages <- c("pbapply", "dplyr", "data.table", "ggplot2", "grid", | |
"gridExtra", "stringr", "readr") | |
new_packages <- list_of_packages[!(list_of_packages %in% | |
installed.packages()[,"Package"])] | |
if(length(new_packages)) install.packages(new_packages) | |
# load packages | |
library(pbapply) | |
library(dplyr) | |
library(data.table) | |
library(ggplot2) | |
library(grid) | |
library(gridExtra) | |
library(stringr) | |
library(readr) | |
# United States Historical Climatology Network Daily Dataset Archive: | |
# http://cdiac.ornl.gov/ftp/ushcn_daily/ | |
# Station file format: | |
# http://cdiac.ornl.gov/ftp/ushcn_daily/station_file_format.txt | |
if (!file.exists("data/ushcn-stations.txt")) { | |
download.file("http://cdiac.ornl.gov/ftp/ushcn_daily/ushcn-stations.txt", "data/ushcn-stations.txt") | |
} | |
stations <- read_fwf("data/ushcn-stations.txt", | |
fwf_widths(c(6, 9, 10, 7, 3, 31, 7, 7, 7, 3), | |
col_names=c("coop_id", "latitude", "longitude", "elevation", | |
"state", "name", "component_1", "component_2", | |
"component_3", "utc_offset")), | |
col_types=c("cnnncccccc")) | |
# not a great circle, but it gets the job done here | |
closestStation <- function(stations, lat, lon) { | |
index <- which.min(sqrt((stations$latitude-lat)^2 + | |
(stations$longitude-lon)^2)) | |
stations[index,] | |
} | |
# define latlongs for locations of interest | |
locations <- data.frame(location=c("Paradise", "Bellingham"), | |
lat=c(46.7858, 48.7502), | |
long=c(-121.737, -122.4750)) | |
# Paradise, WA ------------------------------------------------------------ | |
# what's the closest station? | |
# we'd like to use Seattle, but there isn't enough snow here :( | |
#closestStation(stations, 47.6097, -122.3331) | |
closestStation(stations, locations[1,]$lat, locations[1,]$long) | |
our_station <- closestStation(stations, locations[1,]$lat, locations[1,]$long)$coop_id | |
if (!file.exists("data/state45_WA.txt.gz")) { | |
download.file("http://cdiac.ornl.gov/ftp/ushcn_daily/state45_WA.txt.gz", "data/state45_WA.txt.gz") | |
} | |
# we aren't using read.fwf since the records are wide and we want them long | |
# so we just read in the records line-by-line | |
snow <- readLines(gzfile("data/state45_WA.txt.gz")) | |
# we're only looking for SNOW records | |
snow <- grep("SNOW", snow, value=TRUE) | |
# and, we're only looking for a particular station | |
snow_paradise <- grep(paste0("^", our_station), snow, value=TRUE) | |
# now that we have those, we'll make our data.table by iterating over each | |
# SNOW record and build a data.table from the daily records, then combine | |
# all of the SNOW records into one data.table | |
snow_dat <- rbindlist(pblapply(snow_paradise, function(x) { | |
rbindlist(lapply(1:31, function(i) { | |
# record format described here: | |
# http://cdiac.ornl.gov/ftp/ushcn_daily/data_format.txt | |
start <- 17 + (i-1)*8 | |
list(coop_id=substr(x, 1, 6), | |
date=sprintf("%s-%02d-%02d", substr(x, 7, 10), as.numeric(substr(x, 11, 12)), i), | |
element=substr(x, 13, 16), | |
value=as.numeric(substr(x, start, start+4)), | |
mflag=substr(x, start+5, start+5), | |
qflag=substr(x, start+6, start+6), | |
sflag=substr(x, start+7, start+7)) | |
})) | |
})) | |
# -9999 indicates no data or invalid data, so ignore those records | |
# we could also make them NA, but that's not necessary for this exercise | |
snow_dat <- snow_dat %>% filter(value != -9999) | |
# since the data file has 31 days for each records regardless of whether | |
# that's valid or not we do a shortcut to remove invalid dates by doing the | |
# a vectorized Date conversion, then removing records with NA dates | |
snow_dat$date <- as.Date(snow_dat$date) | |
snow_dat <- snow_dat %>% filter(!is.na(date)) | |
# having the year extracted is handy for filtering | |
snow_dat$year <- format(snow_dat$date, "%Y") | |
# since the Winter season in the US spans years, we have to work with the | |
# "day of year" and then munge it for Jan-May dates (we assume the first snowfall | |
# was never in June, but is prbly not not be true for Alaska, so user-beware) | |
snow_dat$doy <- as.numeric(format(snow_dat$date, "%j")) | |
snow_dat$doy <- ifelse(snow_dat$doy<=180, | |
snow_dat$doy + as.numeric(format(as.Date(sprintf("%s-12-31", snow_dat$year)), "%j")), | |
snow_dat$doy) | |
# now the fun begins | |
first <- snow_dat %>% | |
filter(value>0) %>% # ignore 0 values | |
filter(date>=as.Date("1950-01-01")) %>% # start at 1950 (arbitrary) | |
merge(stations, by="coop_id", all.x=TRUE) %>% # merge station details | |
group_by(coop_id, year) %>% # group by station and year | |
arrange(doy) %>% # sort by our munged day of year | |
filter(row_number(doy) == 1) %>% # grab the first entry by group | |
select(name, state, date, value, doy) # we only need some variables | |
# make a nice title for the visualization | |
title_1 <- sprintf("First observed snowfall (historical) at %s, %s", | |
str_to_title(unique(first$name)), | |
unique(first$state)) | |
# an overused plot by me (Rudis), but I really like the dot-line charts and we | |
# add a twist by using a snowflake for the point and use icy blue colors | |
gg <- ggplot(first, aes(y=year, x=doy)) | |
gg <- gg + geom_segment(aes(xend=min(first$doy)-20, yend=year), color="#9ecae1", size=0.25) | |
gg <- gg + geom_point(aes(color=coop_id), shape=8, size=3, color="#3182bd") | |
gg <- gg + geom_text(aes(label=format(date, "%b-%d")), size=3, hjust=-0.2) | |
gg <- gg + scale_x_continuous(expand=c(0, 0), limits=c(min(first$doy)-20, max(first$doy)+20)) | |
gg <- gg + labs(x=NULL, y=NULL, title=title_1) | |
gg <- gg + theme_bw() | |
gg <- gg + theme(legend.position="none") | |
gg <- gg + theme(panel.grid=element_blank()) | |
gg <- gg + theme(panel.border=element_blank()) | |
gg <- gg + theme(axis.ticks.x=element_blank()) | |
gg <- gg + theme(axis.ticks.y=element_blank()) | |
gg <- gg + theme(axis.text.x=element_blank()) | |
gg <- gg + theme(axis.text.y=element_text(color="#08306b")) | |
by_year <- gg | |
# we're going to pair the dot-line plot with a boxplot, but I also want to | |
# give some indication of the most likely range for the first snowfall, so | |
# we grab the quartiles via summary and use them to annotate the second graph | |
wx_range <- summary(as.Date(format(first$date, "2013-%m-%d"))) | |
names(wx_range) <- NULL | |
min_wx <- gsub("2013-", "", wx_range[2]) | |
max_wx <- gsub("2013-", "", wx_range[5]) | |
title_2 <- sprintf("Most likely first snowfall will be between %s & %s", min_wx, max_wx) | |
# we use a trick to line up the box plot with the dot-line plot by | |
# using the same character width y-axis labels but making them the background | |
# color (in this case, white) and keeping the x-axis limits the same. there | |
# may be another way to do this but this is quick, and easy to remember. | |
# a violin plot would work here as well | |
gg <- ggplot(first %>% mutate(name="0000"), aes(name, doy)) | |
gg <- gg + geom_boxplot(fill="#3182bd", color="#08306b", outlier.colour="#08306b") | |
gg <- gg + scale_y_continuous(expand=c(0, 0), | |
limits=c(min(first$doy)-20, max(first$doy)+20)) | |
gg <- gg + coord_flip() | |
gg <- gg + labs(x=NULL, y=NULL, title=title_2) | |
gg <- gg + theme_bw() | |
gg <- gg + theme(legend.position="none") | |
gg <- gg + theme(panel.grid=element_blank()) | |
gg <- gg + theme(panel.border=element_blank()) | |
gg <- gg + theme(axis.ticks.x=element_blank()) | |
gg <- gg + theme(axis.text.x=element_blank()) | |
gg <- gg + theme(axis.ticks.y=element_line(color="white")) | |
gg <- gg + theme(axis.text.y=element_text(color="white")) | |
gg <- gg + theme(plot.title=element_text(size=11)) | |
box_wx <- gg | |
# final presentation | |
grid.arrange(by_year, box_wx, nrow=2, heights=unit(c(0.9, 0.1), "npc")) | |
# BONUS - Compare with Bellingham, WA ------------------------------------- | |
# we'll repeat the analysis, looking at Bellingham, WA | |
our_station <- closestStation(stations, locations[2,]$lat, locations[2,]$long)$coop_id | |
# and, we're only looking for a particular station | |
snow_bellingham <- grep(paste0("^", our_station), snow, value=TRUE) | |
# now that we have those, we'll make our data.table by iterating over each | |
# SNOW record and build a data.table from the daily records, then combine | |
# all of the SNOW records into one data.table | |
snow_dat <- rbindlist(pblapply(snow_bellingham, function(x) { | |
rbindlist(lapply(1:31, function(i) { | |
# record format described here: | |
# http://cdiac.ornl.gov/ftp/ushcn_daily/data_format.txt | |
start <- 17 + (i-1)*8 | |
list(coop_id=substr(x, 1, 6), | |
date=sprintf("%s-%02d-%02d", substr(x, 7, 10), as.numeric(substr(x, 11, 12)), i), | |
element=substr(x, 13, 16), | |
value=as.numeric(substr(x, start, start+4)), | |
mflag=substr(x, start+5, start+5), | |
qflag=substr(x, start+6, start+6), | |
sflag=substr(x, start+7, start+7)) | |
})) | |
})) | |
# -9999 indicates no data or invalid data, so ignore those records | |
# we could also make them NA, but that's not necessary for this exercise | |
snow_dat <- snow_dat %>% filter(value != -9999) | |
# since the data file has 31 days for each records regardless of whether | |
# that's valid or not we do a shortcut to remove invalid dates by doing the | |
# a vectorized Date conversion, then removing records with NA dates | |
snow_dat$date <- as.Date(snow_dat$date) | |
snow_dat <- snow_dat %>% filter(!is.na(date)) | |
# having the year extracted is handy for filtering | |
snow_dat$year <- format(snow_dat$date, "%Y") | |
# since the Winter season in the US spans years, we have to work with the | |
# "day of year" and then munge it for Jan-May dates (we assume the first snowfall | |
# was never in June, but is prbly not not be true for Alaska, so user-beware) | |
snow_dat$doy <- as.numeric(format(snow_dat$date, "%j")) | |
snow_dat$doy <- ifelse(snow_dat$doy<=180, | |
snow_dat$doy + as.numeric(format(as.Date(sprintf("%s-12-31", snow_dat$year)), "%j")), | |
snow_dat$doy) | |
# now the fun begins | |
first <- snow_dat %>% | |
filter(value>0) %>% # ignore 0 values | |
filter(date>=as.Date("1950-01-01")) %>% # start at 1950 (arbitrary) | |
merge(stations, by="coop_id", all.x=TRUE) %>% # merge station details | |
group_by(coop_id, year) %>% # group by station and year | |
arrange(doy) %>% # sort by our munged day of year | |
filter(row_number(doy) == 1) %>% # grab the first entry by group | |
select(name, state, date, value, doy) # we only need some variables | |
# make a nice title for the visualization | |
title_1 <- sprintf("First observed snowfall (historical) at %s, %s", | |
str_to_title(unique(first$name)), | |
unique(first$state)) | |
# an overused plot by me (Rudis), but I really like the dot-line charts and we | |
# add a twist by using a snowflake for the point and use icy blue colors | |
gg <- ggplot(first, aes(y=year, x=doy)) | |
gg <- gg + geom_segment(aes(xend=min(first$doy)-20, yend=year), color="#9ecae1", size=0.25) | |
gg <- gg + geom_point(aes(color=coop_id), shape=8, size=3, color="#3182bd") | |
gg <- gg + geom_text(aes(label=format(date, "%b-%d")), size=3, hjust=-0.2) | |
gg <- gg + scale_x_continuous(expand=c(0, 0), limits=c(min(first$doy)-20, max(first$doy)+20)) | |
gg <- gg + labs(x=NULL, y=NULL, title=title_1) | |
gg <- gg + theme_bw() | |
gg <- gg + theme(legend.position="none") | |
gg <- gg + theme(panel.grid=element_blank()) | |
gg <- gg + theme(panel.border=element_blank()) | |
gg <- gg + theme(axis.ticks.x=element_blank()) | |
gg <- gg + theme(axis.ticks.y=element_blank()) | |
gg <- gg + theme(axis.text.x=element_blank()) | |
gg <- gg + theme(axis.text.y=element_text(color="#08306b")) | |
by_year_bonus <- gg | |
# we're going to pair the dot-line plot with a boxplot, but I also want to | |
# give some indication of the most likely range for the first snowfall, so | |
# we grab the quartiles via summary and use them to annotate the second graph | |
wx_range <- summary(as.Date(format(first$date, "2013-%m-%d"))) | |
names(wx_range) <- NULL | |
min_wx <- gsub("2013-", "", wx_range[2]) | |
max_wx <- gsub("2013-", "", wx_range[5]) | |
title_2 <- sprintf("Most likely first snowfall will be between %s & %s", min_wx, max_wx) | |
# we use a trick to line up the box plot with the dot-line plot by | |
# using the same character width y-axis labels but making them the background | |
# color (in this case, white) and keeping the x-axis limits the same. there | |
# may be another way to do this but this is quick, and easy to remember. | |
# a violin plot would work here as well | |
gg <- ggplot(first %>% mutate(name="0000"), aes(name, doy)) | |
gg <- gg + geom_boxplot(fill="#3182bd", color="#08306b", outlier.colour="#08306b") | |
gg <- gg + scale_y_continuous(expand=c(0, 0), | |
limits=c(min(first$doy)-20, max(first$doy)+20)) | |
gg <- gg + coord_flip() | |
gg <- gg + labs(x=NULL, y=NULL, title=title_2) | |
gg <- gg + theme_bw() | |
gg <- gg + theme(legend.position="none") | |
gg <- gg + theme(panel.grid=element_blank()) | |
gg <- gg + theme(panel.border=element_blank()) | |
gg <- gg + theme(axis.ticks.x=element_blank()) | |
gg <- gg + theme(axis.text.x=element_blank()) | |
gg <- gg + theme(axis.ticks.y=element_line(color="white")) | |
gg <- gg + theme(axis.text.y=element_text(color="white")) | |
gg <- gg + theme(plot.title=element_text(size=11)) | |
box_wx_bonus <- gg | |
# Side by Side Comparison ------------------------------------------------- | |
# Note: npc == normalised parent coordinates | |
wa_paradise <- arrangeGrob(by_year, box_wx, nrow=2, heights=unit(c(0.9, 0.1), "npc")) | |
wa_bellingham <- arrangeGrob(by_year_bonus, box_wx_bonus, nrow=2, heights=unit(c(0.9, 0.1), "npc")) | |
grid.arrange(wa_paradise, wa_bellingham, ncol=2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment