Created
July 26, 2016 04:56
-
-
Save TonyLadson/9e65e5fcdb81d0e2aacbbbae0102c105 to your computer and use it in GitHub Desktop.
Quality codes in hydrologic data https://tonyladson.wordpress.com/2016/07/26/quality-codes-in-hydrologic-data-ii/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) | |
library(readr) | |
library(dplyr) | |
library(lubridate) | |
library(ggplot2) | |
library(scales) | |
library(treemap) | |
#__________________________________________________________________________________________ | |
# Load in data and pre-processing | |
# At a minimum the data set needs a column of dates (time stamp), flows and quality codes for each time stamp. | |
# path to my data | |
my_path <- '/Users/anthonyladson/Dropbox/Projects/N586-Loads-2016/Analysis/data' | |
my_file_1 <- '229135A-229143A_LVL_FLW_1975-98.CSV' | |
my_file_2 <- '229135A-229143A_LVL_FLW_1999-2016.CSV' | |
fname_1 <- str_c(my_path, my_file_1, sep = '/') | |
yarra_1 <- read_csv(fname_1, skip = 11766, col_names = FALSE ) # Chandler data starts at row 11766 | |
yarra_1 | |
tail(yarra_1) | |
glimpse(yarra_1) | |
fname_2 <- str_c(my_path, my_file_2, sep = '/') | |
yarra_2 <- read_csv(fname_2, skip = 5, col_names = FALSE ) | |
yarra_2 | |
# Merge the two data sets | |
yarra <- bind_rows(yarra_1, yarra_2) | |
# Function to calculate the number of seconds since Jan 1 in a particular year | |
ysec <- function(x){ | |
as.numeric(difftime(x, floor_date(x, unit = 'year'), units="secs")) | |
} | |
yarra <- yarra %>% | |
select(-10) %>% | |
rename(my_datetime = X1, | |
level_229135A = X2, | |
level_229135A_qcode = X3, | |
discharge_229135A = X4, | |
discharge_229135A_qcode = X5, | |
level_229143A = X6, | |
level_229143A_qcode = X7, | |
discharge_229143A = X8, | |
discharge_229143A_qcode = X9)%>% | |
mutate(my_datetime = parse_date_time(my_datetime, orders = 'dmy HM')) %>% | |
mutate(my.yday = yday(my_datetime)) %>% | |
mutate(my.ysec = ysec(my_datetime) ) %>% | |
mutate(my.year = year(my_datetime)) | |
# Check on time step | |
yarra %>% | |
mutate(time_step = my_datetime - lag(my_datetime)) %>% | |
count(time_step) | |
# Source: local data frame [2 x 2] | |
# | |
# time_step n | |
# <S3: difftime> <int> | |
# 1 12 mins 1788346 | |
# 2 NA mins 1 | |
# Constant 12 min time step so Ok | |
#_____________________________________________________________________________________________ | |
# Box plot by quality code | |
ggplot(yarra, aes(x = factor(discharge_229143A_qcode), y = discharge_229143A, fill = factor(discharge_229143A_qcode))) + | |
geom_boxplot() + | |
scale_x_discrete(name = "Quality code") + | |
scale_y_log10(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_fill_discrete(name = "Quality code") + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
legend.position = c(0.9, 0.5), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Time series coloured by quality code | |
# set up locations of month starts for graph labelling | |
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(seq(0,11,3)) # label every 3rd month | |
month_start <- as.numeric(month_start) | |
ggplot(yarra, aes(x = my.ysec, y = discharge_229143A, color = factor(discharge_229143A_qcode))) + | |
geom_point(size = 0.1) + | |
facet_wrap(~my.year) + | |
scale_colour_discrete(name = "Quality code") + | |
guides(colour = guide_legend(override.aes = list(size=5))) + | |
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
axis.text.x=element_text(size=rel(0.6)), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Time series coloured by quality code - showing single code | |
ggplot(yarra, aes(x = my.ysec, y = discharge_229143A)) + | |
geom_line(colour = alpha('grey', 0.5)) + | |
geom_point(data = subset(yarra, subset = discharge_229143A_qcode == 120), size = 0.1, colour = 'red') + | |
facet_wrap(~my.year) + | |
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) + | |
ggtitle('Quality code 120') + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
axis.text.x=element_text(size=rel(0.6)), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Time series coloured by quality code - showing single code and single year | |
ggplot(data = subset(yarra, subset = my.year == 1989), aes(x = my.ysec, y = discharge_229143A)) + | |
geom_line(colour = alpha('black', 0.9)) + | |
geom_point(data = subset(yarra, subset = discharge_229143A_qcode == 120 & my.year == 1989), size = 0.1, colour = 'red') + | |
#facet_wrap(~my.year) + | |
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) + | |
ggtitle('Quality code 120') + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
axis.text.x=element_text(size=rel(0.6)), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Time series coloured by quality code - showing single code, single year, selected months | |
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(0:11) # label | |
month_start <- as.numeric(month_start) | |
yarra <- yarra %>% | |
mutate(my.month = month(my_datetime)) | |
plot.month <- c(10,11) | |
plot.year <- 1989 | |
transparent_legend = theme( | |
legend.background = element_rect(fill = "transparent"), | |
legend.key = element_rect(fill = "transparent", | |
color = "transparent")) | |
# Thanks to https://hopstat.wordpress.com/2016/02/18/how-i-build-up-a-ggplot2-figure/ | |
ggplot(data = subset(yarra, subset = my.year %in% plot.year & my.month %in% plot.month), aes(x = my.ysec, y = discharge_229143A)) + | |
geom_point(data = subset(yarra, subset = my.year %in% 1989 & my.month %in% plot.month ), aes(colour = factor(discharge_229143A_qcode)), size = 0.1) + | |
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb) + | |
guides(colour = guide_legend(title = 'Quality code', override.aes = list(size=5))) + | |
theme_bw() + | |
transparent_legend + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
legend.position = c(0.9, 0.5), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Missing data | |
# Time series coloured by quality code | |
# set up locations of month starts for graph labelling | |
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(seq(0,11,3)) # label every 3rd month | |
month_start <- as.numeric(month_start) | |
yarra <- yarra %>% | |
mutate(my.month = month(my_datetime)) %>% | |
mutate(flow_missing = ifelse(is.na(discharge_229143A), -50, discharge_229143A)) # code missing data as -50 | |
ggplot(yarra, aes(x = my.ysec, y = flow_missing, color = factor(discharge_229143A_qcode))) + | |
geom_point(size = 0.1) + | |
facet_wrap(~my.year) + | |
scale_colour_discrete(name = "Quality code") + | |
guides(colour = guide_legend(override.aes = list(size=5))) + | |
scale_y_continuous(bquote('Discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb[seq(1,12,3)]) + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
axis.text.x=element_text(size=rel(0.6)), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Tile plot | |
# set up locations of month starts for graph labelling | |
month_start <- ymd_hms('1970-01-01_00:00:00') %m+% months(0:11) # | |
month_start <- as.numeric(month_start) | |
# Labels for years (y-axis) | |
year.labels <- yarra %>% | |
distinct(my.year) %>% | |
.$my.year | |
ggplot(yarra, aes(x = my.ysec, y = my.year)) + | |
geom_tile(aes(fill = factor(discharge_229143A_qcode) )) + | |
scale_fill_discrete(name='Quality Code') + | |
scale_x_continuous(name="Month", breaks=month_start, labels=month.abb) + | |
scale_y_continuous(name = "Year", breaks=year.labels, labels = year.labels) + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=14), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Tree map | |
qcode <- as.data.frame(table(yarra$level_229135A_qcode)) | |
qcode %>% | |
arrange(desc(Freq)) | |
# Var1 Freq | |
# 1 1 1293515 | |
# 2 2 219935 | |
# 3 50 154328 | |
# 4 255 76187 | |
# 5 151 16527 | |
# 6 120 12820 | |
# 7 104 5246 | |
# 8 76 4683 | |
# 9 150 2596 | |
# 10 101 942 | |
# 11 105 825 | |
# 12 201 568 | |
# 13 82 123 | |
# 14 75 52 | |
quartz() | |
treemap(qcode, index="Var1", vSize="Freq", title="", algorithm="pivotSize") | |
# How are missing values coded? | |
yarra %>% | |
filter(is.na(discharge_229143A)) %>% | |
count(discharge_229143A_qcode) | |
# discharge_229143A_qcode n | |
# <int> <int> | |
# 1 151 1127 | |
# 2 180 712 | |
# 3 255 16747 | |
# Make a table of the different codes that are used | |
qcode_table <- yarra %>% | |
count(discharge_229143A_qcode) %>% | |
mutate(pc = 100 * n/sum(n)) %>% | |
mutate(pc = round(pc, 2)) %>% | |
mutate(n = format(n, big.mark = ',')) %>% | |
rename(`Proportion (%)` = pc) %>% | |
rename(`Quality code` = discharge_229143A_qcode) %>% | |
rename('Number' = n) | |
# write to clipboard | |
clip <- pipe("pbcopy", "w") | |
write.table(qcode_table, file=clip, sep = '\t', row.names = FALSE, quote = FALSE) | |
close(clip) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment