Skip to content

Instantly share code, notes, and snippets.

@davidski
Last active September 25, 2015 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidski/3fb7e7578e22b5d51df8 to your computer and use it in GitHub Desktop.
Save davidski/3fb7e7578e22b5d51df8 to your computer and use it in GitHub Desktop.
Coursera RStudio First Snowfall Example
https://www.rstudio.com/resources/cheatsheets/
Particularly
- Data Wrangling - https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
- Data Visualization - https://www.rstudio.com/wp-content/uploads/2015/08/ggplot2-cheatsheet.pdf
# Name: Snowfirst.R
# Date: 9/25/2015
# Purpose: Demo code for Coursera Study Group
# Almost entirely based upon the work of Bob Rudis (@hrbrmstr)
# https://github.com/hrbrmstr/snowfirst
# install any missing packages
list_of_packages <- c("pbapply", "dplyr", "data.table", "ggplot2", "grid",
"gridExtra", "stringr", "readr")
new_packages <- list_of_packages[!(list_of_packages %in%
installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# load packages
library(pbapply)
library(dplyr)
library(data.table)
library(ggplot2)
library(grid)
library(gridExtra)
library(stringr)
library(readr)
# United States Historical Climatology Network Daily Dataset Archive:
# http://cdiac.ornl.gov/ftp/ushcn_daily/
# Station file format:
# http://cdiac.ornl.gov/ftp/ushcn_daily/station_file_format.txt
if (!file.exists("data/ushcn-stations.txt")) {
download.file("http://cdiac.ornl.gov/ftp/ushcn_daily/ushcn-stations.txt", "data/ushcn-stations.txt")
}
stations <- read_fwf("data/ushcn-stations.txt",
fwf_widths(c(6, 9, 10, 7, 3, 31, 7, 7, 7, 3),
col_names=c("coop_id", "latitude", "longitude", "elevation",
"state", "name", "component_1", "component_2",
"component_3", "utc_offset")),
col_types=c("cnnncccccc"))
# not a great circle, but it gets the job done here
closestStation <- function(stations, lat, lon) {
index <- which.min(sqrt((stations$latitude-lat)^2 +
(stations$longitude-lon)^2))
stations[index,]
}
# define latlongs for locations of interest
locations <- data.frame(location=c("Paradise", "Bellingham"),
lat=c(46.7858, 48.7502),
long=c(-121.737, -122.4750))
# Paradise, WA ------------------------------------------------------------
# what's the closest station?
# we'd like to use Seattle, but there isn't enough snow here :(
#closestStation(stations, 47.6097, -122.3331)
closestStation(stations, locations[1,]$lat, locations[1,]$long)
our_station <- closestStation(stations, locations[1,]$lat, locations[1,]$long)$coop_id
if (!file.exists("data/state45_WA.txt.gz")) {
download.file("http://cdiac.ornl.gov/ftp/ushcn_daily/state45_WA.txt.gz", "data/state45_WA.txt.gz")
}
# we aren't using read.fwf since the records are wide and we want them long
# so we just read in the records line-by-line
snow <- readLines(gzfile("data/state45_WA.txt.gz"))
# we're only looking for SNOW records
snow <- grep("SNOW", snow, value=TRUE)
# and, we're only looking for a particular station
snow_paradise <- grep(paste0("^", our_station), snow, value=TRUE)
# now that we have those, we'll make our data.table by iterating over each
# SNOW record and build a data.table from the daily records, then combine
# all of the SNOW records into one data.table
snow_dat <- rbindlist(pblapply(snow_paradise, function(x) {
rbindlist(lapply(1:31, function(i) {
# record format described here:
# http://cdiac.ornl.gov/ftp/ushcn_daily/data_format.txt
start <- 17 + (i-1)*8
list(coop_id=substr(x, 1, 6),
date=sprintf("%s-%02d-%02d", substr(x, 7, 10), as.numeric(substr(x, 11, 12)), i),
element=substr(x, 13, 16),
value=as.numeric(substr(x, start, start+4)),
mflag=substr(x, start+5, start+5),
qflag=substr(x, start+6, start+6),
sflag=substr(x, start+7, start+7))
}))
}))
# -9999 indicates no data or invalid data, so ignore those records
# we could also make them NA, but that's not necessary for this exercise
snow_dat <- snow_dat %>% filter(value != -9999)
# since the data file has 31 days for each records regardless of whether
# that's valid or not we do a shortcut to remove invalid dates by doing the
# a vectorized Date conversion, then removing records with NA dates
snow_dat$date <- as.Date(snow_dat$date)
snow_dat <- snow_dat %>% filter(!is.na(date))
# having the year extracted is handy for filtering
snow_dat$year <- format(snow_dat$date, "%Y")
# since the Winter season in the US spans years, we have to work with the
# "day of year" and then munge it for Jan-May dates (we assume the first snowfall
# was never in June, but is prbly not not be true for Alaska, so user-beware)
snow_dat$doy <- as.numeric(format(snow_dat$date, "%j"))
snow_dat$doy <- ifelse(snow_dat$doy<=180,
snow_dat$doy + as.numeric(format(as.Date(sprintf("%s-12-31", snow_dat$year)), "%j")),
snow_dat$doy)
# now the fun begins
first <- snow_dat %>%
filter(value>0) %>% # ignore 0 values
filter(date>=as.Date("1950-01-01")) %>% # start at 1950 (arbitrary)
merge(stations, by="coop_id", all.x=TRUE) %>% # merge station details
group_by(coop_id, year) %>% # group by station and year
arrange(doy) %>% # sort by our munged day of year
filter(row_number(doy) == 1) %>% # grab the first entry by group
select(name, state, date, value, doy) # we only need some variables
# make a nice title for the visualization
title_1 <- sprintf("First observed snowfall (historical) at %s, %s",
str_to_title(unique(first$name)),
unique(first$state))
# an overused plot by me (Rudis), but I really like the dot-line charts and we
# add a twist by using a snowflake for the point and use icy blue colors
gg <- ggplot(first, aes(y=year, x=doy))
gg <- gg + geom_segment(aes(xend=min(first$doy)-20, yend=year), color="#9ecae1", size=0.25)
gg <- gg + geom_point(aes(color=coop_id), shape=8, size=3, color="#3182bd")
gg <- gg + geom_text(aes(label=format(date, "%b-%d")), size=3, hjust=-0.2)
gg <- gg + scale_x_continuous(expand=c(0, 0), limits=c(min(first$doy)-20, max(first$doy)+20))
gg <- gg + labs(x=NULL, y=NULL, title=title_1)
gg <- gg + theme_bw()
gg <- gg + theme(legend.position="none")
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks.x=element_blank())
gg <- gg + theme(axis.ticks.y=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(axis.text.y=element_text(color="#08306b"))
by_year <- gg
# we're going to pair the dot-line plot with a boxplot, but I also want to
# give some indication of the most likely range for the first snowfall, so
# we grab the quartiles via summary and use them to annotate the second graph
wx_range <- summary(as.Date(format(first$date, "2013-%m-%d")))
names(wx_range) <- NULL
min_wx <- gsub("2013-", "", wx_range[2])
max_wx <- gsub("2013-", "", wx_range[5])
title_2 <- sprintf("Most likely first snowfall will be between %s & %s", min_wx, max_wx)
# we use a trick to line up the box plot with the dot-line plot by
# using the same character width y-axis labels but making them the background
# color (in this case, white) and keeping the x-axis limits the same. there
# may be another way to do this but this is quick, and easy to remember.
# a violin plot would work here as well
gg <- ggplot(first %>% mutate(name="0000"), aes(name, doy))
gg <- gg + geom_boxplot(fill="#3182bd", color="#08306b", outlier.colour="#08306b")
gg <- gg + scale_y_continuous(expand=c(0, 0),
limits=c(min(first$doy)-20, max(first$doy)+20))
gg <- gg + coord_flip()
gg <- gg + labs(x=NULL, y=NULL, title=title_2)
gg <- gg + theme_bw()
gg <- gg + theme(legend.position="none")
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks.x=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(axis.ticks.y=element_line(color="white"))
gg <- gg + theme(axis.text.y=element_text(color="white"))
gg <- gg + theme(plot.title=element_text(size=11))
box_wx <- gg
# final presentation
grid.arrange(by_year, box_wx, nrow=2, heights=unit(c(0.9, 0.1), "npc"))
# BONUS - Compare with Bellingham, WA -------------------------------------
# we'll repeat the analysis, looking at Bellingham, WA
our_station <- closestStation(stations, locations[2,]$lat, locations[2,]$long)$coop_id
# and, we're only looking for a particular station
snow_bellingham <- grep(paste0("^", our_station), snow, value=TRUE)
# now that we have those, we'll make our data.table by iterating over each
# SNOW record and build a data.table from the daily records, then combine
# all of the SNOW records into one data.table
snow_dat <- rbindlist(pblapply(snow_bellingham, function(x) {
rbindlist(lapply(1:31, function(i) {
# record format described here:
# http://cdiac.ornl.gov/ftp/ushcn_daily/data_format.txt
start <- 17 + (i-1)*8
list(coop_id=substr(x, 1, 6),
date=sprintf("%s-%02d-%02d", substr(x, 7, 10), as.numeric(substr(x, 11, 12)), i),
element=substr(x, 13, 16),
value=as.numeric(substr(x, start, start+4)),
mflag=substr(x, start+5, start+5),
qflag=substr(x, start+6, start+6),
sflag=substr(x, start+7, start+7))
}))
}))
# -9999 indicates no data or invalid data, so ignore those records
# we could also make them NA, but that's not necessary for this exercise
snow_dat <- snow_dat %>% filter(value != -9999)
# since the data file has 31 days for each records regardless of whether
# that's valid or not we do a shortcut to remove invalid dates by doing the
# a vectorized Date conversion, then removing records with NA dates
snow_dat$date <- as.Date(snow_dat$date)
snow_dat <- snow_dat %>% filter(!is.na(date))
# having the year extracted is handy for filtering
snow_dat$year <- format(snow_dat$date, "%Y")
# since the Winter season in the US spans years, we have to work with the
# "day of year" and then munge it for Jan-May dates (we assume the first snowfall
# was never in June, but is prbly not not be true for Alaska, so user-beware)
snow_dat$doy <- as.numeric(format(snow_dat$date, "%j"))
snow_dat$doy <- ifelse(snow_dat$doy<=180,
snow_dat$doy + as.numeric(format(as.Date(sprintf("%s-12-31", snow_dat$year)), "%j")),
snow_dat$doy)
# now the fun begins
first <- snow_dat %>%
filter(value>0) %>% # ignore 0 values
filter(date>=as.Date("1950-01-01")) %>% # start at 1950 (arbitrary)
merge(stations, by="coop_id", all.x=TRUE) %>% # merge station details
group_by(coop_id, year) %>% # group by station and year
arrange(doy) %>% # sort by our munged day of year
filter(row_number(doy) == 1) %>% # grab the first entry by group
select(name, state, date, value, doy) # we only need some variables
# make a nice title for the visualization
title_1 <- sprintf("First observed snowfall (historical) at %s, %s",
str_to_title(unique(first$name)),
unique(first$state))
# an overused plot by me (Rudis), but I really like the dot-line charts and we
# add a twist by using a snowflake for the point and use icy blue colors
gg <- ggplot(first, aes(y=year, x=doy))
gg <- gg + geom_segment(aes(xend=min(first$doy)-20, yend=year), color="#9ecae1", size=0.25)
gg <- gg + geom_point(aes(color=coop_id), shape=8, size=3, color="#3182bd")
gg <- gg + geom_text(aes(label=format(date, "%b-%d")), size=3, hjust=-0.2)
gg <- gg + scale_x_continuous(expand=c(0, 0), limits=c(min(first$doy)-20, max(first$doy)+20))
gg <- gg + labs(x=NULL, y=NULL, title=title_1)
gg <- gg + theme_bw()
gg <- gg + theme(legend.position="none")
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks.x=element_blank())
gg <- gg + theme(axis.ticks.y=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(axis.text.y=element_text(color="#08306b"))
by_year_bonus <- gg
# we're going to pair the dot-line plot with a boxplot, but I also want to
# give some indication of the most likely range for the first snowfall, so
# we grab the quartiles via summary and use them to annotate the second graph
wx_range <- summary(as.Date(format(first$date, "2013-%m-%d")))
names(wx_range) <- NULL
min_wx <- gsub("2013-", "", wx_range[2])
max_wx <- gsub("2013-", "", wx_range[5])
title_2 <- sprintf("Most likely first snowfall will be between %s & %s", min_wx, max_wx)
# we use a trick to line up the box plot with the dot-line plot by
# using the same character width y-axis labels but making them the background
# color (in this case, white) and keeping the x-axis limits the same. there
# may be another way to do this but this is quick, and easy to remember.
# a violin plot would work here as well
gg <- ggplot(first %>% mutate(name="0000"), aes(name, doy))
gg <- gg + geom_boxplot(fill="#3182bd", color="#08306b", outlier.colour="#08306b")
gg <- gg + scale_y_continuous(expand=c(0, 0),
limits=c(min(first$doy)-20, max(first$doy)+20))
gg <- gg + coord_flip()
gg <- gg + labs(x=NULL, y=NULL, title=title_2)
gg <- gg + theme_bw()
gg <- gg + theme(legend.position="none")
gg <- gg + theme(panel.grid=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks.x=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(axis.ticks.y=element_line(color="white"))
gg <- gg + theme(axis.text.y=element_text(color="white"))
gg <- gg + theme(plot.title=element_text(size=11))
box_wx_bonus <- gg
# Side by Side Comparison -------------------------------------------------
# Note: npc == normalised parent coordinates
wa_paradise <- arrangeGrob(by_year, box_wx, nrow=2, heights=unit(c(0.9, 0.1), "npc"))
wa_bellingham <- arrangeGrob(by_year_bonus, box_wx_bonus, nrow=2, heights=unit(c(0.9, 0.1), "npc"))
grid.arrange(wa_paradise, wa_bellingham, ncol=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment