Skip to content

Instantly share code, notes, and snippets.

@TonyLadson
Last active June 5, 2023 01:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TonyLadson/2c44ce82d4ccf3d12de35ea7752a6d87 to your computer and use it in GitHub Desktop.
Save TonyLadson/2c44ce82d4ccf3d12de35ea7752a6d87 to your computer and use it in GitHub Desktop.
Regional Flood Frequency Estimation (RFFE) comparison with at-site data. See the blog https://tonyladson.wordpress.com/2019/06/10/on-the-accuracy-of-the-rffe-in-nsw/
#Packages
library(tidyverse)
library(stringr)
library(dplyr)
library(readxl)
library(readr)
library(purrr)
library(ggrepel)
library(hydroGOF)
# Functions ---------------------------------------------------------------
# not in
"%ni%" <- function(x, y) x[!x %in% y]
# function to plot logarithmic minor breaks
# from https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot
log_breaks = function(maj, radix=10) {
function(x) {
minx = floor(min(logb(x,radix), na.rm=T)) - 1
maxx = ceiling(max(logb(x,radix), na.rm=T)) + 1
n_major = maxx - minx + 1
major_breaks = seq(minx, maxx, by=1)
if (maj) {
breaks = major_breaks
} else {
steps = logb(1:(radix-1),radix)
breaks = rep(steps, times=n_major) +
rep(major_breaks, each=radix-1)
}
radix^breaks
}
}
scale_x_log_eng = function(..., radix=10) {
scale_x_continuous(...,
trans=scales::log_trans(radix),
breaks=log_breaks(TRUE, radix),
minor_breaks=log_breaks(FALSE, radix))
}
scale_y_log_eng = function(..., radix=10) {
scale_y_continuous(...,
trans=scales::log_trans(radix),
breaks=log_breaks(TRUE, radix),
minor_breaks=log_breaks(FALSE, radix))
}
# RFFE inputs ---------------------------------
# These have been obtained by scraping the RFFE
# Programatically visiting a large number of sites and
# downloading information on the particular site its its neighbours
# https://rffe.arr-software.org
# Data file RFFE-data-2_extra.csv is at:
#https://drive.google.com/file/d/100qEQlxsKNsEGcqdDHi8Fks5r0sPH-XR/view?usp=sharing
# setwd("~/Dropbox/R-repos/RFFE/R") # to get this working on my computer
fname <- file.path("../data","RFFE-data-2_extra.csv" )
rffe_inputs <- read_csv(fname, skip = 0)
glimpse(rffe_inputs)
# Check missing data
colSums(is.na(rffe_inputs))
rffe_inputs %>%
filter(is.na(statistics.mean)) %>%
select(region.name) %>%
count(region.name)
# Looks fine the missing values are associated with the arid zone gauges or the Pilbara
# Load information on RFFE gauges ---------------------------------
# From the Appendix A to the Stage 3 Project 5 report
# http://arr.ga.gov.au/__data/assets/pdf_file/0004/40468/ARR_Project-5_Stage-3_report.pdf
# Ataur Rahman, Khaled Haddad, Md Mahmudul Haque, George Kuczera, Erwin Weinmann (2015)
# Project 5 Regional Flood Methods: Stage 3
# Data is available at
#https://drive.google.com/file/d/1k-LkAaXSUvoGrXMew1-yHRQOAkYHRlc5/view?usp=sharing
fname <- file.path("../data","RFFE-Gauges.xlsx" )
rffe_gauges <- read_excel(fname, skip = 1)
glimpse(rffe_gauges)
rffe_gauges %>%
distinct(State)
# 1 NSW_ACT
# 2 Vic
# 3 SA
# 4 Tas
# 5 Qld
# 6 WA
# 7 NT
# 8 Arid
# We are only interested in NSW_ACT gauges. There are also probably
# some NSW gauges in the Arid region but
# We'll leave these out.
rffe_gauges <- rffe_gauges %>%
filter(State == "NSW_ACT") %>%
mutate(station_id = as.character(Station_ID)) # Consistent capitalisation and type to other data
glimpse(rffe_gauges)
#colSums(is.na(rffe_gauges))
#nothing missing
# Load NSW RFFE outputs ------------------------------------------------------
# Data is available from
# https://drive.google.com/file/d/16m2BderA9kOTk1Hrx1iGJh7_ZTBfp7d_/view?usp=sharing
fname <- file.path('../data', "RFFE_output_at_nsw_stations.csv" )
nsw_rffe_gauges_out <- read_csv(fname,
col_types = cols(.default = col_double(),
station_id = col_character(), # use character for consistency with other data
station_name = col_character()
))
glimpse(nsw_rffe_gauges_out)
#colSums(is.na(nsw_rffe_gauges_out))
# Nothing missing
# WMA data ----------------------------------------------------------------
# At site flood frequency analysis results for ~160 NSW gauges
# Many of which coincide with the gauges used in the RFFE
# Reference
# Scott Podger, Mark Babister, Aaron Trim, Monique Retallick, Melissa Adam (2019)
# Review of ARR design inputs for NSW
# WMA water for office of environment and heritage
# These data come from Table C1 in Appendix C to the report
# Data is availalble from
# https://drive.google.com/file/d/15UoiuLjdgdySpo-4mJfvsuXlTBXj4Gl-/view?usp=sharing
fname <- file.path('../data','TableC1.xlsx' )
nsw_atsite_FFA <- read_xlsx(fname, skip = 5)
glimpse(nsw_atsite_FFA)
nsw_atsite_FFA <- nsw_atsite_FFA %>% # rename some columns so they are easier to work with
rename(FFA_AEP_50 = `50% AEP`,
FFA_AEP_20 = `20% AEP`,
FFA_AEP_10 = `10% AEP`,
FFA_AEP_5 = `5% AEP`,
FFA_AEP_2 = `2% AEP`,
FFA_AEP_1 = `1% AEP`) %>%
mutate(station_id = as.character(station_id)) # change to character for consistency with other data
# Checking missing
colSums(is.na(nsw_atsite_FFA))
# 4 missing values
nsw_atsite_FFA %>%
filter(is.na(station))
# station station_id station_name lon lat record_length ams_num FFA_AEP_50 FFA_AEP_20 FFA_AEP_10
# <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 NA NA Wollondilly… 150.25 -34.222 57 57 473 1606 2475
# 2 NA NA Nattai Case… 150.62 -34.222 42 42 88 310 492
# 3 NA NA Nepean Rive… 150.42 -34.136 43 43 273 983 1525
# 4 NA NA Cox River a… 150.25 -33.861 56 56 184 779 1279
# These extra sites were added by WMA water they are not included in the project 5 sites
# That is, they can be ignored
# Join -----------------------------------------------------------------
# Join the FFA estimates to the RFFE estimates
glimpse(nsw_rffe_gauges_out)
glimpse(nsw_atsite_FFA)
# join by station_id
nsw_comp <- nsw_rffe_gauges_out %>%
left_join(nsw_atsite_FFA, by = 'station_id')
# Check the names are the same in both data sets
nsw_comp %>%
filter(station_name.x != station_name.y)
# the names are all the same so remove one and rename the other
nsw_comp <- nsw_comp %>%
select(-station_name.y) %>%
rename(station_name = station_name.x)
# join information on the RFFE gauges
glimpse(rffe_gauges)
nsw_comp <- nsw_comp %>%
left_join(rffe_gauges, by = "station_id")
glimpse(nsw_comp)
# check that the station names are the same
# along with the lats and lons
options(pillar.sigfig = 5) # Would like to see more digits for the comparison
nsw_comp %>%
filter(station_name != Station_Name) %>%
select(station_id, station_name, Station_Name, Lat, lat, Lon, lon, Area) # the capitalised and
# non-capitalised values come from different files
# # A tibble: 12 x 8
# station_id station_name Station_Name Lat lat Lon lon Area
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 219017 near Brogo Near Brogo -36.6 -36.598 149.81 149.81 152
# 2 219015 near Bermagui Near Bermagui -36.43 -36.431 150.01 150.00 31
# 3 219001 Brown Mt Brown Mountain -36.6 -36.597 149.44 149.44 15
# 4 216004 Falls Creek Falls Ck -34.97 -34.968 150.6 150.60 95
# 5 410156 Book Book Book -35.35 -35.353 147.55 147.55 145
# 6 410038 Butmaroo Darbalara -35.02 -35.02 148.25 148.25 411
# 7 208026 Forbesdale (Causeway) Jacky Barkers -31.64 -32.038 151.74 151.87 560
# 8 210044 Middle Falbrook (Fal Dam sight) Middle Falbrook(Fal Dam Si -32.45 -32.45 151.15 151.15 466
# 9 211010 U/S Wyongh R (Durren La) U/S Wyong R (Durren La) -33.25 -33.244 151.39 151.39 92
# 10 204056 Gibrattar Range Gibraltar Range -29.49 -29.49 152.45 152.45 104
# 11 204036 Sandy Hill(below Snake Creek) Sandy Hill(below Snake Cre -28.93 -28.93 152.22 152.22 236
# 12 203014 Etham Eltham -28.76 -28.756 153.4 153.40 223
# >
# These all look like the same sites in each file, apart from possibily 208026
# Comparison 1: Compare the RFFE output with the FFA results -----------------------------------------------------------------
glimpse(nsw_comp)
# nash-sutcliffe
hydroGOF::NSE(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
#[1] -1.291951
hydroGOF::rmse(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
#[1] 1459.073
hydroGOF::NSE(sim = log10(nsw_comp$flow_1pc), obs = log10(nsw_comp$FFA_AEP_1))
# [1] 0.3914661
# Bias
nsw_comp %>%
summarise(mean(FFA_AEP_1 - flow_1pc))
nsw_comp %>%
ggplot(aes(flow_1pc, FFA_AEP_1)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_smooth(colour = 'blue', se = FALSE) +
scale_y_continuous(name = "1% AEP using At-site FFA",
label = scales::comma,
limits = c(0,9000),
breaks = seq(0,9000,1000)) +
scale_x_continuous(name = "1% AEP estimate from the RFFE",
label = scales::comma,
limits = c(0,9000),
breaks = seq(0,9000,1000))
# Make a web version of the figure
p <- nsw_comp %>%
ggplot(aes(x = flow_1pc, y = FFA_AEP_1)) +
geom_point(size = 0.2) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
scale_y_continuous(name = bquote('1% AEP from at-site FFA (m' ^3 *'s' ^-1 *')'),
label = scales::comma,
limits = c(0,9000),
breaks = seq(0,9000,1000)) +
scale_x_continuous(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')'),
label = scales::comma,
limits = c(0,9000),
breaks = seq(0,9000,1000)) +
theme_grey(base_size = 5)
p
ggsave(file.path('../figures', 'FFA-RFFEout.png'), p, width = 4, height = 3)
# Log transformed version
p <- nsw_comp %>%
ggplot(aes( x = flow_1pc, y = FFA_AEP_1)) +
geom_point(size = 0.2) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
scale_y_log_eng(name = bquote("1% AEP from at-site FFA (m" ^3 *'s' ^-1 *')'),
limits = c(NA, 10000)) +
scale_x_log_eng(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')')) +
theme_grey(base_size = 5)
p
ggsave(file.path('../figures', 'FFA-RFFEout_log.png'), p, width = 4, height = 3)
# Mean Difference Plot
# Alternative analysis based on ratios
# Following Bland and Altman 1999
# The correct way to calculate the ratio is y/yhat
# In this case y = the at-site values (FFA)
# yhat are the RFFE values
# Residuals are usually y - yhat
# logy - logyhat = log(y/yhat)
# # Add logs of 1% values
# nsw_comp_log <- nsw_comp %>%
# mutate(FFA_AEP_1_log = log10(FFA_AEP_1),
# flow_1pc_log = log10(flow_1pc),
# diff_log = FFA_AEP_1_log - flow_1pc_log,
# mean_log = 0.5 * (FFA_AEP_1_log + FFA_AEP_1_log))
# Add ratios to dataframe
nsw_comp_ratio <- nsw_comp %>%
mutate(#ratio_1pc = flow_1pc/FFA_AEP_1,
ratio_1pc = FFA_AEP_1/flow_1pc, # use the ratio y/yhat
mean_1pc = 0.5*(flow_1pc + FFA_AEP_1))
# Calculate statistics of ratios
ratio_stats_log <-
nsw_comp_ratio %>%
mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
summarise(mean_difference_log = mean(ratio_1pc_log),
sd_diff_log = sd(ratio_1pc_log)) %>%
unlist()
ratio_stats_log
# mean_difference_log sd_diff_log
# -0.05677213 0.35657627
10^ratio_stats_log["mean_difference_log"]
# mean_difference_log
# 0.8774611
ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
ULA
# [1] 0.6563804
10^ULA
#[1] 4.532945
LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
LLA
#[1] -0.7699247
10^LLA
# [1] 0.1698538
# These are the same answers as calculated when log-tranforming the differences.
# make a data frame for labels
nsw_comp_ratio_labels <- nsw_comp_ratio %>%
filter(ratio_1pc > 4.53)
# Add ratios to dataframe and plot
nsw_comp_ratio %>%
ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
geom_point() +
scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
limits = c(0.1, 100)) +
scale_x_continuous(name = 'Average of AEP 1% estimates') +
geom_hline(yintercept = 10^LLA, linetype = 'dashed') + # Upper confidence interval
geom_hline(yintercept = 10^ULA, linetype = 'dashed') + # Lower confidence interval
geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed') + # mean
geom_text(data = nsw_comp_ratio_labels,
aes(x = mean_1pc, y = ratio_1pc, label = station_id),
nudge_x = -40,
hjust = 'right') +
annotate(geom = "text", x = 4500, y = 10^ULA + 1, label = sprintf("mean + 2SD = %.2f", 10^ULA), hjust = 'left') +
annotate(geom = "text", x = 4500, y = 10^LLA - 0.02, label = sprintf("mean - 2SD = %.2f", 10^LLA), hjust = 'left') +
annotate(geom = "text", x = 4500,
y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
hjust = 'left')
# Make a web version
# Everything needs to be scaled down
my_size = 0.1
my_text_size = 2
p <- nsw_comp_ratio %>%
ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
geom_point(size = my_size) +
scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
limits = c(0.1, 100)) +
scale_x_continuous(name = 'Average of AEP 1% estimates (cumec)', label = scales::comma) +
geom_hline(yintercept = 10^LLA, linetype = 'dashed', size = my_size) + # Upper confidence interval
geom_hline(yintercept = 10^ULA, linetype = 'dashed', size = my_size) + # Lower confidence interval
geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed', size = my_size) + # mean
geom_text(data = nsw_comp_ratio_labels,
aes(x = mean_1pc, y = ratio_1pc, label = station_id),
nudge_x = -50,
hjust = 'right',
size = my_text_size) +
annotate(geom = "text", x = 4500,
y = 10^ULA + 1,
label = sprintf("mean + 2SD = %.2f", 10^ULA),
hjust = 'left',
size = my_text_size) +
annotate(geom = "text", x = 4500,
y = 10^LLA - 0.02,
label = sprintf("mean - 2SD = %.2f", 10^LLA),
hjust = 'left',
size = my_text_size) +
annotate(geom = "text", x = 4500,
y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
hjust = 'left',
size = my_text_size) +
theme_grey(base_size = 7)
p
ggsave(file.path('../figures', 'FFA-RFFE_ratio.png'), p, width = 4, height = 3)
# Checking outliers
# Looking at BoM http://www.bom.gov.au/waterdata/
# 212040 KIALLA CREEK AT POMEROY -34.61 149.54 1%AEP = 110 cumec(GEV)
# 210084 GLENNIES CREEK AT THE ROCKS NO.2 -32.36 151.24 1%AEP = 700-1500 cumec(GEV)
# 210084 is d/s of Glennies Ck Dam
# Checking on the Glennies Ck Site
nsw_comp %>%
filter(station_id == 210084) %>%
select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)
# # A tibble: 1 x 8
# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 The Rocks No.2 130 5985 227 -32.370 -32.365 151.24 151.24
# Issues is that flow_1pc is << FFA_AEP_1 i.e. the 1% flow returned by
# the RFFE tool is much less than the 1% At-site values calculated by WMA water
# Check the RFFE input for 210084
rffe_inputs %>%
filter(station_id == '210084')
# # A tibble: 1 x 42
# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 210084 227 38 -32.270 151.29 -32.365 151.24 -2.181 7.533 15.633 1034 0.7841 4539.6 1573.2 381.38
# the flow_1pc is 4539.6 which is close to the FFA results but
# putting the outlet and centroids back into the RFFE results in a very low output.
# It look like this is because the bias correction is so large
# 212040 KIALLA CREEK AT POMEROY
nsw_comp %>%
filter(station_id == 212040) %>%
select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)
# # A tibble: 1 x 8
# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Pomeroy 138 1819 96 -34.61 -34.61 149.54 149.54
rffe_inputs %>%
filter(station_id == '212040')
# # A tibble: 1 x 42
# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc flow_10pc
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 212040 96 32 -34.560 149.50 -34.61 149.54 -0.963 5.35 10.6 703.7 0.6906 767.83 513.16 274.83 154.62
# #
# Not obvious what the problem is with 212040 but the output 1% AEP value is much less than
# the At site values. The input values for this site is also smaller than the At-site FFA value.
# How much do we change the limits of agreement if we remove the outliers
ratio_stats_log <-
nsw_comp_ratio %>%
filter(! station_id %in% c(210084, 212040)) %>%
mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
summarise(mean_difference_log = mean(ratio_1pc_log),
sd_diff_log = sd(ratio_1pc_log)) %>%
unlist()
ratio_stats_log
# mean_difference_log sd_diff_log
# -0.07510516 0.31743895
10^ratio_stats_log["mean_difference_log"]
# mean_difference_log
# 0.8411914
ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
ULA
# [1] 0.5597728
10^ULA
#[1] [1] 3.628881
LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
LLA
#[1] -0.7099831
10^LLA
# [1] 0.1949921
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment