Skip to content

Instantly share code, notes, and snippets.

@TonyLadson
Created July 26, 2016 04:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TonyLadson/9e65e5fcdb81d0e2aacbbbae0102c105 to your computer and use it in GitHub Desktop.
Save TonyLadson/9e65e5fcdb81d0e2aacbbbae0102c105 to your computer and use it in GitHub Desktop.
library(stringr)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(scales)
library(treemap)
#__________________________________________________________________________________________
# Load in data and pre-processing
# At a minimum the data set needs a column of dates (time stamp), flows and quality codes for each time stamp.
# path to my data
my_path <- '/Users/anthonyladson/Dropbox/Projects/N586-Loads-2016/Analysis/data'
my_file_1 <- '229135A-229143A_LVL_FLW_1975-98.CSV'
my_file_2 <- '229135A-229143A_LVL_FLW_1999-2016.CSV'
fname_1 <- str_c(my_path, my_file_1, sep = '/')
yarra_1 <- read_csv(fname_1, skip = 11766, col_names = FALSE ) # Chandler data starts at row 11766
yarra_1
tail(yarra_1)
glimpse(yarra_1)
fname_2 <- str_c(my_path, my_file_2, sep = '/')
yarra_2 <- read_csv(fname_2, skip = 5, col_names = FALSE )
yarra_2
# Merge the two data sets
yarra <- bind_rows(yarra_1, yarra_2)
# Function to calculate the number of seconds since Jan 1 in a particular year
ysec <- function(x){
as.numeric(difftime(x, floor_date(x, unit = 'year'), units="secs"))
}
yarra <- yarra %>%
select(-10) %>%
rename(my_datetime = X1,
level_229135A = X2,
level_229135A_qcode = X3,
discharge_229135A = X4,
discharge_229135A_qcode = X5,
level_229143A = X6,
level_229143A_qcode = X7,
discharge_229143A = X8,
discharge_229143A_qcode = X9)%>%
mutate(my_datetime = parse_date_time(my_datetime, orders = 'dmy HM')) %>%
mutate(my.yday = yday(my_datetime)) %>%
mutate(my.ysec = ysec(my_datetime) ) %>%
mutate(my.year = year(my_datetime))
# Check on time step
yarra %>%
mutate(time_step = my_datetime - lag(my_datetime)) %>%
count(time_step)
# Source: local data frame [2 x 2]
#
# time_step n
# <S3: difftime> <int>
# 1 12 mins 1788346
# 2 NA mins 1
# Constant 12 min time step so Ok
#_____________________________________________________________________________________________
# Box plot by quality code
ggplot(yarra, aes(x = factor(discharge_229143A_qcode), y = discharge_229143A, fill = factor(discharge_229143A_qcode))) +
geom_boxplot() +
scale_x_discrete(name = "Quality code") +
scale_y_log10(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_fill_discrete(name = "Quality code") +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
legend.position = c(0.9, 0.5),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Time series coloured by quality code
# set up locations of month starts for graph labelling
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(seq(0,11,3)) # label every 3rd month
month_start <- as.numeric(month_start)
ggplot(yarra, aes(x = my.ysec, y = discharge_229143A, color = factor(discharge_229143A_qcode))) +
geom_point(size = 0.1) +
facet_wrap(~my.year) +
scale_colour_discrete(name = "Quality code") +
guides(colour = guide_legend(override.aes = list(size=5))) +
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
axis.text.x=element_text(size=rel(0.6)),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Time series coloured by quality code - showing single code
ggplot(yarra, aes(x = my.ysec, y = discharge_229143A)) +
geom_line(colour = alpha('grey', 0.5)) +
geom_point(data = subset(yarra, subset = discharge_229143A_qcode == 120), size = 0.1, colour = 'red') +
facet_wrap(~my.year) +
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) +
ggtitle('Quality code 120') +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
axis.text.x=element_text(size=rel(0.6)),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Time series coloured by quality code - showing single code and single year
ggplot(data = subset(yarra, subset = my.year == 1989), aes(x = my.ysec, y = discharge_229143A)) +
geom_line(colour = alpha('black', 0.9)) +
geom_point(data = subset(yarra, subset = discharge_229143A_qcode == 120 & my.year == 1989), size = 0.1, colour = 'red') +
#facet_wrap(~my.year) +
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) +
ggtitle('Quality code 120') +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
axis.text.x=element_text(size=rel(0.6)),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Time series coloured by quality code - showing single code, single year, selected months
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(0:11) # label
month_start <- as.numeric(month_start)
yarra <- yarra %>%
mutate(my.month = month(my_datetime))
plot.month <- c(10,11)
plot.year <- 1989
transparent_legend = theme(
legend.background = element_rect(fill = "transparent"),
legend.key = element_rect(fill = "transparent",
color = "transparent"))
# Thanks to https://hopstat.wordpress.com/2016/02/18/how-i-build-up-a-ggplot2-figure/
ggplot(data = subset(yarra, subset = my.year %in% plot.year & my.month %in% plot.month), aes(x = my.ysec, y = discharge_229143A)) +
geom_point(data = subset(yarra, subset = my.year %in% 1989 & my.month %in% plot.month ), aes(colour = factor(discharge_229143A_qcode)), size = 0.1) +
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb) +
guides(colour = guide_legend(title = 'Quality code', override.aes = list(size=5))) +
theme_bw() +
transparent_legend +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
legend.position = c(0.9, 0.5),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Missing data
# Time series coloured by quality code
# set up locations of month starts for graph labelling
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(seq(0,11,3)) # label every 3rd month
month_start <- as.numeric(month_start)
yarra <- yarra %>%
mutate(my.month = month(my_datetime)) %>%
mutate(flow_missing = ifelse(is.na(discharge_229143A), -50, discharge_229143A)) # code missing data as -50
ggplot(yarra, aes(x = my.ysec, y = flow_missing, color = factor(discharge_229143A_qcode))) +
geom_point(size = 0.1) +
facet_wrap(~my.year) +
scale_colour_discrete(name = "Quality code") +
guides(colour = guide_legend(override.aes = list(size=5))) +
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
axis.text.x=element_text(size=rel(0.6)),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Tile plot
# set up locations of month starts for graph labelling
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(0:11) #
month_start <- as.numeric(month_start)
# Labels for years (y-axis)
year.labels <- yarra %>%
distinct(my.year) %>%
.$my.year
ggplot(yarra, aes(x = my.ysec, y = my.year)) +
geom_tile(aes(fill = factor(discharge_229143A_qcode) )) +
scale_fill_discrete(name='Quality Code') +
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb) +
scale_y_continuous(name = "Year", breaks=year.labels, labels = year.labels) +
theme_bw() +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Tree map
qcode <- as.data.frame(table(yarra$level_229135A_qcode))
qcode %>%
arrange(desc(Freq))
# Var1 Freq
# 1 1 1293515
# 2 2 219935
# 3 50 154328
# 4 255 76187
# 5 151 16527
# 6 120 12820
# 7 104 5246
# 8 76 4683
# 9 150 2596
# 10 101 942
# 11 105 825
# 12 201 568
# 13 82 123
# 14 75 52
quartz()
treemap(qcode, index="Var1", vSize="Freq", title="", algorithm="pivotSize")
# How are missing values coded?
yarra %>%
filter(is.na(discharge_229143A)) %>%
count(discharge_229143A_qcode)
# discharge_229143A_qcode n
# <int> <int>
# 1 151 1127
# 2 180 712
# 3 255 16747
# Make a table of the different codes that are used
qcode_table <- yarra %>%
count(discharge_229143A_qcode) %>%
mutate(pc = 100 * n/sum(n)) %>%
mutate(pc = round(pc, 2)) %>%
mutate(n = format(n, big.mark = ',')) %>%
rename(`Proportion (%)` = pc) %>%
rename(`Quality code` = discharge_229143A_qcode) %>%
rename('Number' = n)
# write to clipboard
clip <- pipe("pbcopy", "w")
write.table(qcode_table, file=clip, sep = '\t', row.names = FALSE, quote = FALSE)
close(clip)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment