Last active
September 8, 2016 01:15
-
-
Save TonyLadson/843410a1f9e1f1bbf1f48eeeee79a007 to your computer and use it in GitHub Desktop.
Script to analyse hourly flow data see: https://tonyladson.wordpress.com/2016/05/30/script-to-analyse-hourly-flow-data/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hourly data from Victorian monitoring site | |
# This script takes the standard download of hourly max, min, mean data from | |
# http://data.water.vic.gov.au/monitoring.htm | |
# File is downloaded as 'gaugenumber.csv' | |
#remove(list = objects()) | |
library(readr) | |
library(stringr) | |
library(dplyr) | |
library(lubridate) | |
library(ggplot2) | |
library(scales) | |
library(ggrepel) | |
library(RcppRoll) | |
#________________________________________________________ | |
# | |
# Helper functions | |
# | |
# | |
# Shadow shift | |
# Courtesy Nicholas Tierney | |
# https://github.com/njtierney/ggmissing/blob/master/R/shadow_shift.R | |
shadow_shift <- function(x){ | |
xrange <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE) | |
xmin <- min(x, na.rm = TRUE) | |
# create the "jitter" to be added around the points. | |
xrunif <- (runif(length(x))-0.5)*xrange*0.05 | |
ifelse(is.na(x), | |
# add the jitter around the those values that are missing | |
yes = xmin-xrange*0.1 + xrunif, | |
no = x) | |
} # close function | |
# | |
Value_atMax <- function(vec_1, vec_2) { | |
# get the value of vec_1 at the position of the maximum of vec_2 | |
# this handles the case where vec_2 is all missing | |
if(all(is.na(vec_2))) return(NA) | |
vec_1[which.max(vec_2)] | |
} | |
# Helper function to count missing | |
N_missing <- function(x){ # sum number of missing values | |
sum(is.na(x)) | |
} | |
#__________________________________________________________________________________________ | |
# Download the hour data from | |
# http://data.water.vic.gov.au/monitoring.htm | |
# or a specific site | |
# http://data.water.vic.gov.au/monitoring.htm?ppbm=222200&rs&1&rsdf_org | |
# path to the file | |
# my.path <- 'path to file' | |
# my.file <- 'name of file | |
fname <- str_c(my.path, my.file, sep = '/') | |
# Read in the data | |
# 1. Find where the data starts by searching for the line that begins with 'Datetime' | |
heading.line <- which(str_detect(read_lines(fname, n_max = 50), 'Datetime')) # searching the first 50 lines | |
if(!any(heading.line)) stop('Heading not found') # trap error if heading not found | |
flow <- read_csv(fname, skip = heading.line - 1, n_max = Inf) | |
# check if there are any problems reading in the data | |
# Often, the final row is read in incorrectly | |
my.problems <- problems(flow) | |
n.problems <- nrow(my.problems) | |
if(n.problems > 2) stop('Problems reading in flow file (not just the final row)') | |
if(n.problems == 1) { # try to fix the problem, its likely the final row has been misread and can be removed | |
if(my.problems$row == nrow(flow)) { # is there a problem with the final row | |
flow <- flow[-nrow(flow), ] # remove final row | |
} else { | |
stop('Problems reading in flow file (single problem, not the final row)') # otherwise stop | |
} | |
} | |
# name columns | |
names(flow) <- c('date.time', 'flow.mean', 'flow.mean.qc', 'flow.min', 'flow.min.qc', 'flow.max', 'flow.max.qc') | |
# parse dates | |
flow <- flow %>% | |
mutate(date.time = parse_date_time(date.time, orders = 'dmy H:M:S')) | |
flow.annual <- flow %>% | |
mutate(flow.max = flow.max) %>% | |
mutate(flow.year = year(date.time)) %>% | |
mutate(flow.month = month(date.time))%>% | |
group_by(flow.year) %>% | |
summarise(max.date = Value_atMax(date.time, flow.max), | |
max.flow = max(flow.max, na.rm = FALSE), | |
max.month = Value_atMax(flow.month, flow.max), | |
n.days = n()/(24), | |
n.missing = N_missing(flow.max), | |
pc.missing = 100*n.missing/n()) | |
View(flow.annual) | |
# flow data frame is now ready for further analysis | |
# Flow example | |
# 1. Plot annual peaks | |
flow.annual %>% | |
mutate(max.flow.shifted = shadow_shift(max.flow)) %>% | |
ggplot(aes(x = flow.year, y = max.flow.shifted, colour = is.na(max.flow))) + geom_point() + | |
xlab('Year') + | |
# scale_y_continuous(bquote('Peak discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_y_continuous(name = 'Peak flow', labels = comma) + | |
scale_colour_discrete(name = 'Missing') + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=12), | |
axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# Time between peaks | |
# if this is too short the peaks are not independent | |
flow.annual %>% | |
mutate(time_diff =difftime(max.date, lag(max.date))) %>% | |
arrange(time_diff) %>% | |
View | |
# 2. Time of year polar plot | |
Date_to_Angle <- function(my.date){ | |
start.year <- floor_date(my.date, 'year') | |
end.year <- ceiling_date(my.date, 'year') | |
length.year <- as.numeric(difftime(end.year, start.year, units = 'sec')) | |
time.since.year.start <- as.numeric(difftime(my.date, start.year, units = 'sec')) | |
2 * pi * time.since.year.start/length.year # date as angle | |
} | |
# Angles for month start | |
month.start <- yday(seq(as.Date('2010-01-01'),as.Date('2010-12-31'), by='1 months')) | |
month.start.radians <- 2*pi*month.start/365.25 | |
# Labels for the largest n points | |
labeled.dat <- flow.annual %>% | |
mutate(angle = Date_to_Angle(max.date)) %>% | |
arrange(desc(max.flow)) %>% | |
slice(1:5) | |
flow.annual %>% | |
mutate(angle = Date_to_Angle(max.date)) %>% | |
ggplot(aes(x=angle, y=max.flow)) + | |
coord_polar('x', start = 0, direction = 1) + | |
geom_segment(aes(y = 0, xend = angle, yend = max.flow)) + | |
scale_x_continuous(name = 'Floods - time of year', breaks = month.start.radians, labels = month.name, limits = c(0, 2*pi)) + | |
scale_y_continuous(name = 'Peak flow', labels = comma) + | |
#scale_y_continuous(name = bquote('Peak discharge (m' ^3 *'s' ^-1 *')')) + | |
geom_text_repel(data = labeled.dat, aes(x = angle, y = max.flow + 100000, label = as.character(flow.year)), segment.size = 0) + | |
theme_bw() + | |
theme( | |
#panel.background = element_rect(fill="gray99"), | |
axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=10), | |
axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left | |
# 3. Annual maximum three day volumes | |
# Use a rolling 72 hour mean | |
# Get max for each year | |
flow %>% | |
mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>% | |
mutate(flow.year = year(date.time)) %>% | |
group_by(flow.year) %>% | |
summarise(vol.max = 3 * max(vol_3day)) %>% | |
View | |
# when does the maximum volume occur | |
flow %>% | |
mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>% | |
mutate(flow.year = year(date.time)) %>% | |
group_by(flow.year) %>% | |
summarise(vol.max = 3 * max(vol_3day, na.rm = TRUE), vol.max.date = Value_atMax(date.time, vol_3day)) %>% | |
View | |
# 4. Empirical frequency curve | |
# Cunnane plotting position is recommended in Australian Rainfall and Runoff | |
pp_cunnane <- function(x){ | |
n = length(x) | |
rank(x)/(n + 0.2) | |
} | |
my_AEP <- c(0.5, 1,2,5,10, 20, 50) | |
my_labels <- str_c(my_AEP, '%') | |
my_breaks <- 1 - AEP/100 | |
flow.annual %>% | |
filter(!is.na(max.flow)) %>% | |
mutate(AEP = pp_cunnane(max.flow)) %>% | |
ggplot(aes(x = AEP, y = max.flow/86.4)) + geom_point() + | |
scale_y_log10(name = 'Peak flow', | |
breaks = c(1e+1, 1e+2, 1e+3, 1e+4), | |
limits = c(10, 1e+4), | |
labels = comma) + | |
scale_x_continuous(trans = "probit", breaks = my_breaks , labels = my_labels )+ | |
theme_gray(base_size = 16) | |
# 5. Plots for indiviual years or time periods | |
# start_time <- ymd_hms('1971-02-06 17:00:00') | |
flow %>% | |
filter(date.time > ymd('1971-01-01', tz = 'UTC') & date.time < ymd('1971-02-20', tz = 'UTC') ) %>% | |
mutate(flow.max.shifted = shadow_shift(flow.max)) %>% | |
ggplot(aes(x = date.time, y = flow.max.shifted/86.4, colour = is.na(flow.max))) + geom_point() + | |
xlab('Year') + | |
# scale_y_continuous(bquote('Peak discharge (m' ^3 *'s' ^-1 *')'), labels = comma) + | |
scale_y_continuous(name = 'Peak flow', labels = comma) + | |
scale_colour_discrete(name = 'Missing') + | |
theme_bw() + | |
theme( | |
panel.background = element_rect(fill="gray98"), | |
axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)), | |
axis.text.x = element_text(colour="grey20",size=12), | |
axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)), | |
axis.text.y = element_text(colour="grey20",size=14), | |
legend.title = element_text(colour="grey20",size=12), | |
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment