TonyLadson/flow_script.R

## flow_script.R
# Hourly data from Victorian monitoring site

# This script takes the standard download of hourly max, min, mean data from
# http://data.water.vic.gov.au/monitoring.htm

# File is downloaded as 'gaugenumber.csv'

#remove(list = objects())


library(readr)
library(stringr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(scales)
library(ggrepel)
library(RcppRoll)


#________________________________________________________
#
# Helper functions
#
#
# Shadow shift
# Courtesy Nicholas Tierney
# https://github.com/njtierney/ggmissing/blob/master/R/shadow_shift.R

shadow_shift <- function(x){

  xrange <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE)

  xmin <- min(x, na.rm = TRUE)

  # create the "jitter" to be added around the points.
  xrunif <- (runif(length(x))-0.5)*xrange*0.05

  ifelse(is.na(x),
         # add the jitter around the those values that are missing
         yes = xmin-xrange*0.1 + xrunif,
         no = x)

} # close function


#

Value_atMax <- function(vec_1, vec_2) {
  # get the value of vec_1 at the position of the maximum of vec_2
  # this handles the case where vec_2 is all missing
  if(all(is.na(vec_2))) return(NA)
  vec_1[which.max(vec_2)]
}


# Helper function to count missing
N_missing <- function(x){ # sum number of missing values
  sum(is.na(x))
}

#__________________________________________________________________________________________

# Download the hour data from
# http://data.water.vic.gov.au/monitoring.htm

# or a specific site
# http://data.water.vic.gov.au/monitoring.htm?ppbm=222200&rs&1&rsdf_org

# path to the file
# my.path <- 'path to file'
# my.file <- 'name of file
fname <- str_c(my.path, my.file, sep = '/')

# Read in the data

# 1. Find where the data starts by searching for the line that begins with 'Datetime'


heading.line <- which(str_detect(read_lines(fname, n_max = 50), 'Datetime')) # searching the first 50 lines
if(!any(heading.line)) stop('Heading not found') # trap error if heading not found


flow <- read_csv(fname, skip = heading.line - 1, n_max = Inf)

# check if there are any problems reading in the data
# Often, the final row is read in incorrectly

my.problems <- problems(flow)
n.problems <- nrow(my.problems)

if(n.problems > 2) stop('Problems reading in flow file (not just the final row)')

if(n.problems == 1) { # try to fix the problem, its likely the final row has been misread and can be removed
  if(my.problems$row == nrow(flow)) { # is there a problem with the final row
    flow <- flow[-nrow(flow), ] # remove final row
  } else {
    stop('Problems reading in flow file (single problem, not the final row)') # otherwise stop
  }
}

# name columns
names(flow) <- c('date.time', 'flow.mean', 'flow.mean.qc', 'flow.min', 'flow.min.qc', 'flow.max', 'flow.max.qc')

# parse dates

flow <- flow %>%
  mutate(date.time = parse_date_time(date.time, orders = 'dmy H:M:S'))


flow.annual <- flow %>%
  mutate(flow.max = flow.max) %>%
  mutate(flow.year = year(date.time)) %>%
  mutate(flow.month = month(date.time))%>%
  group_by(flow.year) %>%
  summarise(max.date = Value_atMax(date.time, flow.max),
            max.flow = max(flow.max, na.rm = FALSE),
            max.month = Value_atMax(flow.month, flow.max),
            n.days = n()/(24),
            n.missing = N_missing(flow.max),
            pc.missing = 100*n.missing/n())


View(flow.annual)

# flow data frame is now ready for further analysis
# Flow example

# 1.  Plot annual peaks

flow.annual %>%
  mutate(max.flow.shifted = shadow_shift(max.flow)) %>%
  ggplot(aes(x = flow.year, y = max.flow.shifted, colour = is.na(max.flow))) + geom_point() +
  xlab('Year') +
  # scale_y_continuous(bquote('Peak discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
  scale_y_continuous(name = 'Peak flow', labels = comma) +
  scale_colour_discrete(name = 'Missing') +
  theme_bw() +
  theme(
    panel.background = element_rect(fill="gray98"),
    axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)),
    axis.text.x = element_text(colour="grey20",size=12),
    axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)),
    axis.text.y = element_text(colour="grey20",size=14),
    legend.title = element_text(colour="grey20",size=12),
    plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left


# Time between peaks
# if this is too short the peaks are not independent

flow.annual %>%
  mutate(time_diff =difftime(max.date, lag(max.date))) %>%
  arrange(time_diff) %>%
  View


# 2. Time of year polar plot

Date_to_Angle <- function(my.date){

  start.year <- floor_date(my.date, 'year')
  end.year <- ceiling_date(my.date, 'year')
  length.year <- as.numeric(difftime(end.year, start.year, units = 'sec'))
  time.since.year.start <- as.numeric(difftime(my.date, start.year, units = 'sec'))

  2 * pi * time.since.year.start/length.year # date as angle

}

# Angles for month start

month.start <- yday(seq(as.Date('2010-01-01'),as.Date('2010-12-31'), by='1 months'))
month.start.radians <- 2*pi*month.start/365.25

# Labels for the largest n points
labeled.dat <- flow.annual %>%
  mutate(angle = Date_to_Angle(max.date)) %>%
  arrange(desc(max.flow)) %>%
  slice(1:5)


flow.annual %>%
  mutate(angle = Date_to_Angle(max.date)) %>%
  ggplot(aes(x=angle, y=max.flow)) +
  coord_polar('x', start = 0, direction = 1) +
  geom_segment(aes(y = 0, xend = angle, yend = max.flow)) +
  scale_x_continuous(name = 'Floods - time of year', breaks = month.start.radians, labels = month.name, limits = c(0, 2*pi)) +
  scale_y_continuous(name = 'Peak flow', labels = comma) +
  #scale_y_continuous(name = bquote('Peak discharge (m' ^3 *'s' ^-1 *')')) +
  geom_text_repel(data = labeled.dat, aes(x = angle, y = max.flow + 100000, label = as.character(flow.year)), segment.size = 0) +
  theme_bw() +
  theme(
    #panel.background = element_rect(fill="gray99"),
    axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
    axis.text.x = element_text(colour="grey20",size=10),
    axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
    axis.text.y = element_text(colour="grey20",size=14),
    legend.title = element_text(colour="grey20",size=12),
    plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left


# 3. Annual maximum three day volumes
# Use a rolling 72 hour mean
# Get max for each year

flow %>%
  mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>%
  mutate(flow.year = year(date.time)) %>%
  group_by(flow.year) %>%
  summarise(vol.max = 3 * max(vol_3day)) %>%
  View

# when does the maximum volume occur

flow %>%
  mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>%
  mutate(flow.year = year(date.time)) %>%
  group_by(flow.year) %>%
  summarise(vol.max = 3 * max(vol_3day, na.rm = TRUE),  vol.max.date = Value_atMax(date.time, vol_3day)) %>%
  View

# 4.  Empirical frequency curve

# Cunnane plotting position is recommended in Australian Rainfall and Runoff

pp_cunnane <- function(x){
  n = length(x)
  rank(x)/(n + 0.2)
}

my_AEP <- c(0.5, 1,2,5,10, 20, 50)
my_labels <- str_c(my_AEP, '%')
my_breaks <- 1 - AEP/100

flow.annual %>%
  filter(!is.na(max.flow)) %>%
  mutate(AEP = pp_cunnane(max.flow)) %>%
  ggplot(aes(x = AEP, y = max.flow/86.4)) + geom_point() +
  scale_y_log10(name = 'Peak flow',
                breaks = c(1e+1, 1e+2, 1e+3, 1e+4),
                limits = c(10, 1e+4),
                labels = comma) +
  scale_x_continuous(trans = "probit", breaks = my_breaks , labels = my_labels )+
  theme_gray(base_size = 16)

# 5. Plots for indiviual years or time periods

# start_time <- ymd_hms('1971-02-06 17:00:00')

flow %>%
  filter(date.time > ymd('1971-01-01', tz = 'UTC') & date.time < ymd('1971-02-20', tz = 'UTC') ) %>%
  mutate(flow.max.shifted = shadow_shift(flow.max)) %>%
  ggplot(aes(x = date.time, y = flow.max.shifted/86.4, colour = is.na(flow.max))) + geom_point() +
  xlab('Year') +
  # scale_y_continuous(bquote('Peak discharge (m' ^3 *'s' ^-1 *')'), labels = comma) +
  scale_y_continuous(name = 'Peak flow', labels = comma) +
  scale_colour_discrete(name = 'Missing') +
  theme_bw() +
  theme(
    panel.background = element_rect(fill="gray98"),
    axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)),
    axis.text.x = element_text(colour="grey20",size=12),
    axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)),
    axis.text.y = element_text(colour="grey20",size=14),
    legend.title = element_text(colour="grey20",size=12),
    plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
	# Hourly data from Victorian monitoring site

	# This script takes the standard download of hourly max, min, mean data from
	# http://data.water.vic.gov.au/monitoring.htm

	# File is downloaded as 'gaugenumber.csv'

	#remove(list = objects())



	library(readr)
	library(stringr)
	library(dplyr)
	library(lubridate)
	library(ggplot2)
	library(scales)
	library(ggrepel)
	library(RcppRoll)


	#________________________________________________________
	#
	# Helper functions
	#
	#
	# Shadow shift
	# Courtesy Nicholas Tierney
	# https://github.com/njtierney/ggmissing/blob/master/R/shadow_shift.R

	shadow_shift <- function(x){

	xrange <- max(x, na.rm = TRUE) - min(x, na.rm = TRUE)

	xmin <- min(x, na.rm = TRUE)

	# create the "jitter" to be added around the points.
	xrunif <- (runif(length(x))-0.5)xrange0.05

	ifelse(is.na(x),
	# add the jitter around the those values that are missing
	yes = xmin-xrange*0.1 + xrunif,
	no = x)

	} # close function


	#

	Value_atMax <- function(vec_1, vec_2) {
	# get the value of vec_1 at the position of the maximum of vec_2
	# this handles the case where vec_2 is all missing
	if(all(is.na(vec_2))) return(NA)
	vec_1[which.max(vec_2)]
	}


	# Helper function to count missing
	N_missing <- function(x){ # sum number of missing values
	sum(is.na(x))
	}

	#__________________________________________________________________________________________

	# Download the hour data from
	# http://data.water.vic.gov.au/monitoring.htm

	# or a specific site
	# http://data.water.vic.gov.au/monitoring.htm?ppbm=222200&rs&1&rsdf_org

	# path to the file
	# my.path <- 'path to file'
	# my.file <- 'name of file
	fname <- str_c(my.path, my.file, sep = '/')

	# Read in the data

	# 1. Find where the data starts by searching for the line that begins with 'Datetime'


	heading.line <- which(str_detect(read_lines(fname, n_max = 50), 'Datetime')) # searching the first 50 lines
	if(!any(heading.line)) stop('Heading not found') # trap error if heading not found


	flow <- read_csv(fname, skip = heading.line - 1, n_max = Inf)

	# check if there are any problems reading in the data
	# Often, the final row is read in incorrectly

	my.problems <- problems(flow)
	n.problems <- nrow(my.problems)

	if(n.problems > 2) stop('Problems reading in flow file (not just the final row)')

	if(n.problems == 1) { # try to fix the problem, its likely the final row has been misread and can be removed
	if(my.problems$row == nrow(flow)) { # is there a problem with the final row
	flow <- flow[-nrow(flow), ] # remove final row
	} else {
	stop('Problems reading in flow file (single problem, not the final row)') # otherwise stop
	}
	}

	# name columns
	names(flow) <- c('date.time', 'flow.mean', 'flow.mean.qc', 'flow.min', 'flow.min.qc', 'flow.max', 'flow.max.qc')

	# parse dates

	flow <- flow %>%
	mutate(date.time = parse_date_time(date.time, orders = 'dmy H:M:S'))


	flow.annual <- flow %>%
	mutate(flow.max = flow.max) %>%
	mutate(flow.year = year(date.time)) %>%
	mutate(flow.month = month(date.time))%>%
	group_by(flow.year) %>%
	summarise(max.date = Value_atMax(date.time, flow.max),
	max.flow = max(flow.max, na.rm = FALSE),
	max.month = Value_atMax(flow.month, flow.max),
	n.days = n()/(24),
	n.missing = N_missing(flow.max),
	pc.missing = 100*n.missing/n())


	View(flow.annual)

	# flow data frame is now ready for further analysis
	# Flow example

	# 1. Plot annual peaks

	flow.annual %>%
	mutate(max.flow.shifted = shadow_shift(max.flow)) %>%
	ggplot(aes(x = flow.year, y = max.flow.shifted, colour = is.na(max.flow))) + geom_point() +
	xlab('Year') +
	# scale_y_continuous(bquote('Peak discharge (m' ^3 's' ^-1 ')'), labels = comma) +
	scale_y_continuous(name = 'Peak flow', labels = comma) +
	scale_colour_discrete(name = 'Missing') +
	theme_bw() +
	theme(
	panel.background = element_rect(fill="gray98"),
	axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)),
	axis.text.x = element_text(colour="grey20",size=12),
	axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)),
	axis.text.y = element_text(colour="grey20",size=14),
	legend.title = element_text(colour="grey20",size=12),
	plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left


	# Time between peaks
	# if this is too short the peaks are not independent

	flow.annual %>%
	mutate(time_diff =difftime(max.date, lag(max.date))) %>%
	arrange(time_diff) %>%
	View



	# 2. Time of year polar plot

	Date_to_Angle <- function(my.date){

	start.year <- floor_date(my.date, 'year')
	end.year <- ceiling_date(my.date, 'year')
	length.year <- as.numeric(difftime(end.year, start.year, units = 'sec'))
	time.since.year.start <- as.numeric(difftime(my.date, start.year, units = 'sec'))

	2 * pi * time.since.year.start/length.year # date as angle

	}

	# Angles for month start

	month.start <- yday(seq(as.Date('2010-01-01'),as.Date('2010-12-31'), by='1 months'))
	month.start.radians <- 2pimonth.start/365.25

	# Labels for the largest n points
	labeled.dat <- flow.annual %>%
	mutate(angle = Date_to_Angle(max.date)) %>%
	arrange(desc(max.flow)) %>%
	slice(1:5)



	flow.annual %>%
	mutate(angle = Date_to_Angle(max.date)) %>%
	ggplot(aes(x=angle, y=max.flow)) +
	coord_polar('x', start = 0, direction = 1) +
	geom_segment(aes(y = 0, xend = angle, yend = max.flow)) +
	scale_x_continuous(name = 'Floods - time of year', breaks = month.start.radians, labels = month.name, limits = c(0, 2*pi)) +
	scale_y_continuous(name = 'Peak flow', labels = comma) +
	#scale_y_continuous(name = bquote('Peak discharge (m' ^3 's' ^-1 ')')) +
	geom_text_repel(data = labeled.dat, aes(x = angle, y = max.flow + 100000, label = as.character(flow.year)), segment.size = 0) +
	theme_bw() +
	theme(
	#panel.background = element_rect(fill="gray99"),
	axis.title.x = element_text(colour="grey20", size=20, margin=margin(20,0,0,0)),
	axis.text.x = element_text(colour="grey20",size=10),
	axis.title.y = element_text(colour="grey20",size=20, margin = margin(0,20,0,0)),
	axis.text.y = element_text(colour="grey20",size=14),
	legend.title = element_text(colour="grey20",size=12),
	plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left



	# 3. Annual maximum three day volumes
	# Use a rolling 72 hour mean
	# Get max for each year

	flow %>%
	mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>%
	mutate(flow.year = year(date.time)) %>%
	group_by(flow.year) %>%
	summarise(vol.max = 3 * max(vol_3day)) %>%
	View

	# when does the maximum volume occur

	flow %>%
	mutate(vol_3day = roll_mean(flow.mean, n = 72, fill = NA, na.rm = TRUE )) %>%
	mutate(flow.year = year(date.time)) %>%
	group_by(flow.year) %>%
	summarise(vol.max = 3 * max(vol_3day, na.rm = TRUE), vol.max.date = Value_atMax(date.time, vol_3day)) %>%
	View

	# 4. Empirical frequency curve

	# Cunnane plotting position is recommended in Australian Rainfall and Runoff

	pp_cunnane <- function(x){
	n = length(x)
	rank(x)/(n + 0.2)
	}

	my_AEP <- c(0.5, 1,2,5,10, 20, 50)
	my_labels <- str_c(my_AEP, '%')
	my_breaks <- 1 - AEP/100

	flow.annual %>%
	filter(!is.na(max.flow)) %>%
	mutate(AEP = pp_cunnane(max.flow)) %>%
	ggplot(aes(x = AEP, y = max.flow/86.4)) + geom_point() +
	scale_y_log10(name = 'Peak flow',
	breaks = c(1e+1, 1e+2, 1e+3, 1e+4),
	limits = c(10, 1e+4),
	labels = comma) +
	scale_x_continuous(trans = "probit", breaks = my_breaks , labels = my_labels )+
	theme_gray(base_size = 16)

	# 5. Plots for indiviual years or time periods

	# start_time <- ymd_hms('1971-02-06 17:00:00')

	flow %>%
	filter(date.time > ymd('1971-01-01', tz = 'UTC') & date.time < ymd('1971-02-20', tz = 'UTC') ) %>%
	mutate(flow.max.shifted = shadow_shift(flow.max)) %>%
	ggplot(aes(x = date.time, y = flow.max.shifted/86.4, colour = is.na(flow.max))) + geom_point() +
	xlab('Year') +
	# scale_y_continuous(bquote('Peak discharge (m' ^3 's' ^-1 ')'), labels = comma) +
	scale_y_continuous(name = 'Peak flow', labels = comma) +
	scale_colour_discrete(name = 'Missing') +
	theme_bw() +
	theme(
	panel.background = element_rect(fill="gray98"),
	axis.title.x = element_text(colour="grey20", size=14, margin=margin(20,0,0,0)),
	axis.text.x = element_text(colour="grey20",size=12),
	axis.title.y = element_text(colour="grey20",size=14, margin = margin(0,20,0,0)),
	axis.text.y = element_text(colour="grey20",size=14),
	legend.title = element_text(colour="grey20",size=12),
	plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left