Last active
June 5, 2023 01:02
-
-
Save TonyLadson/2c44ce82d4ccf3d12de35ea7752a6d87 to your computer and use it in GitHub Desktop.
Regional Flood Frequency Estimation (RFFE) comparison with at-site data. See the blog https://tonyladson.wordpress.com/2019/06/10/on-the-accuracy-of-the-rffe-in-nsw/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Packages | |
library(tidyverse) | |
library(stringr) | |
library(dplyr) | |
library(readxl) | |
library(readr) | |
library(purrr) | |
library(ggrepel) | |
library(hydroGOF) | |
# Functions --------------------------------------------------------------- | |
# not in | |
"%ni%" <- function(x, y) x[!x %in% y] | |
# function to plot logarithmic minor breaks | |
# from https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot | |
log_breaks = function(maj, radix=10) { | |
function(x) { | |
minx = floor(min(logb(x,radix), na.rm=T)) - 1 | |
maxx = ceiling(max(logb(x,radix), na.rm=T)) + 1 | |
n_major = maxx - minx + 1 | |
major_breaks = seq(minx, maxx, by=1) | |
if (maj) { | |
breaks = major_breaks | |
} else { | |
steps = logb(1:(radix-1),radix) | |
breaks = rep(steps, times=n_major) + | |
rep(major_breaks, each=radix-1) | |
} | |
radix^breaks | |
} | |
} | |
scale_x_log_eng = function(..., radix=10) { | |
scale_x_continuous(..., | |
trans=scales::log_trans(radix), | |
breaks=log_breaks(TRUE, radix), | |
minor_breaks=log_breaks(FALSE, radix)) | |
} | |
scale_y_log_eng = function(..., radix=10) { | |
scale_y_continuous(..., | |
trans=scales::log_trans(radix), | |
breaks=log_breaks(TRUE, radix), | |
minor_breaks=log_breaks(FALSE, radix)) | |
} | |
# RFFE inputs --------------------------------- | |
# These have been obtained by scraping the RFFE | |
# Programatically visiting a large number of sites and | |
# downloading information on the particular site its its neighbours | |
# https://rffe.arr-software.org | |
# Data file RFFE-data-2_extra.csv is at: | |
#https://drive.google.com/file/d/100qEQlxsKNsEGcqdDHi8Fks5r0sPH-XR/view?usp=sharing | |
# setwd("~/Dropbox/R-repos/RFFE/R") # to get this working on my computer | |
fname <- file.path("../data","RFFE-data-2_extra.csv" ) | |
rffe_inputs <- read_csv(fname, skip = 0) | |
glimpse(rffe_inputs) | |
# Check missing data | |
colSums(is.na(rffe_inputs)) | |
rffe_inputs %>% | |
filter(is.na(statistics.mean)) %>% | |
select(region.name) %>% | |
count(region.name) | |
# Looks fine the missing values are associated with the arid zone gauges or the Pilbara | |
# Load information on RFFE gauges --------------------------------- | |
# From the Appendix A to the Stage 3 Project 5 report | |
# http://arr.ga.gov.au/__data/assets/pdf_file/0004/40468/ARR_Project-5_Stage-3_report.pdf | |
# Ataur Rahman, Khaled Haddad, Md Mahmudul Haque, George Kuczera, Erwin Weinmann (2015) | |
# Project 5 Regional Flood Methods: Stage 3 | |
# Data is available at | |
#https://drive.google.com/file/d/1k-LkAaXSUvoGrXMew1-yHRQOAkYHRlc5/view?usp=sharing | |
fname <- file.path("../data","RFFE-Gauges.xlsx" ) | |
rffe_gauges <- read_excel(fname, skip = 1) | |
glimpse(rffe_gauges) | |
rffe_gauges %>% | |
distinct(State) | |
# 1 NSW_ACT | |
# 2 Vic | |
# 3 SA | |
# 4 Tas | |
# 5 Qld | |
# 6 WA | |
# 7 NT | |
# 8 Arid | |
# We are only interested in NSW_ACT gauges. There are also probably | |
# some NSW gauges in the Arid region but | |
# We'll leave these out. | |
rffe_gauges <- rffe_gauges %>% | |
filter(State == "NSW_ACT") %>% | |
mutate(station_id = as.character(Station_ID)) # Consistent capitalisation and type to other data | |
glimpse(rffe_gauges) | |
#colSums(is.na(rffe_gauges)) | |
#nothing missing | |
# Load NSW RFFE outputs ------------------------------------------------------ | |
# Data is available from | |
# https://drive.google.com/file/d/16m2BderA9kOTk1Hrx1iGJh7_ZTBfp7d_/view?usp=sharing | |
fname <- file.path('../data', "RFFE_output_at_nsw_stations.csv" ) | |
nsw_rffe_gauges_out <- read_csv(fname, | |
col_types = cols(.default = col_double(), | |
station_id = col_character(), # use character for consistency with other data | |
station_name = col_character() | |
)) | |
glimpse(nsw_rffe_gauges_out) | |
#colSums(is.na(nsw_rffe_gauges_out)) | |
# Nothing missing | |
# WMA data ---------------------------------------------------------------- | |
# At site flood frequency analysis results for ~160 NSW gauges | |
# Many of which coincide with the gauges used in the RFFE | |
# Reference | |
# Scott Podger, Mark Babister, Aaron Trim, Monique Retallick, Melissa Adam (2019) | |
# Review of ARR design inputs for NSW | |
# WMA water for office of environment and heritage | |
# These data come from Table C1 in Appendix C to the report | |
# Data is availalble from | |
# https://drive.google.com/file/d/15UoiuLjdgdySpo-4mJfvsuXlTBXj4Gl-/view?usp=sharing | |
fname <- file.path('../data','TableC1.xlsx' ) | |
nsw_atsite_FFA <- read_xlsx(fname, skip = 5) | |
glimpse(nsw_atsite_FFA) | |
nsw_atsite_FFA <- nsw_atsite_FFA %>% # rename some columns so they are easier to work with | |
rename(FFA_AEP_50 = `50% AEP`, | |
FFA_AEP_20 = `20% AEP`, | |
FFA_AEP_10 = `10% AEP`, | |
FFA_AEP_5 = `5% AEP`, | |
FFA_AEP_2 = `2% AEP`, | |
FFA_AEP_1 = `1% AEP`) %>% | |
mutate(station_id = as.character(station_id)) # change to character for consistency with other data | |
# Checking missing | |
colSums(is.na(nsw_atsite_FFA)) | |
# 4 missing values | |
nsw_atsite_FFA %>% | |
filter(is.na(station)) | |
# station station_id station_name lon lat record_length ams_num FFA_AEP_50 FFA_AEP_20 FFA_AEP_10 | |
# <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 NA NA Wollondilly… 150.25 -34.222 57 57 473 1606 2475 | |
# 2 NA NA Nattai Case… 150.62 -34.222 42 42 88 310 492 | |
# 3 NA NA Nepean Rive… 150.42 -34.136 43 43 273 983 1525 | |
# 4 NA NA Cox River a… 150.25 -33.861 56 56 184 779 1279 | |
# These extra sites were added by WMA water they are not included in the project 5 sites | |
# That is, they can be ignored | |
# Join ----------------------------------------------------------------- | |
# Join the FFA estimates to the RFFE estimates | |
glimpse(nsw_rffe_gauges_out) | |
glimpse(nsw_atsite_FFA) | |
# join by station_id | |
nsw_comp <- nsw_rffe_gauges_out %>% | |
left_join(nsw_atsite_FFA, by = 'station_id') | |
# Check the names are the same in both data sets | |
nsw_comp %>% | |
filter(station_name.x != station_name.y) | |
# the names are all the same so remove one and rename the other | |
nsw_comp <- nsw_comp %>% | |
select(-station_name.y) %>% | |
rename(station_name = station_name.x) | |
# join information on the RFFE gauges | |
glimpse(rffe_gauges) | |
nsw_comp <- nsw_comp %>% | |
left_join(rffe_gauges, by = "station_id") | |
glimpse(nsw_comp) | |
# check that the station names are the same | |
# along with the lats and lons | |
options(pillar.sigfig = 5) # Would like to see more digits for the comparison | |
nsw_comp %>% | |
filter(station_name != Station_Name) %>% | |
select(station_id, station_name, Station_Name, Lat, lat, Lon, lon, Area) # the capitalised and | |
# non-capitalised values come from different files | |
# # A tibble: 12 x 8 | |
# station_id station_name Station_Name Lat lat Lon lon Area | |
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 219017 near Brogo Near Brogo -36.6 -36.598 149.81 149.81 152 | |
# 2 219015 near Bermagui Near Bermagui -36.43 -36.431 150.01 150.00 31 | |
# 3 219001 Brown Mt Brown Mountain -36.6 -36.597 149.44 149.44 15 | |
# 4 216004 Falls Creek Falls Ck -34.97 -34.968 150.6 150.60 95 | |
# 5 410156 Book Book Book -35.35 -35.353 147.55 147.55 145 | |
# 6 410038 Butmaroo Darbalara -35.02 -35.02 148.25 148.25 411 | |
# 7 208026 Forbesdale (Causeway) Jacky Barkers -31.64 -32.038 151.74 151.87 560 | |
# 8 210044 Middle Falbrook (Fal Dam sight) Middle Falbrook(Fal Dam Si -32.45 -32.45 151.15 151.15 466 | |
# 9 211010 U/S Wyongh R (Durren La) U/S Wyong R (Durren La) -33.25 -33.244 151.39 151.39 92 | |
# 10 204056 Gibrattar Range Gibraltar Range -29.49 -29.49 152.45 152.45 104 | |
# 11 204036 Sandy Hill(below Snake Creek) Sandy Hill(below Snake Cre -28.93 -28.93 152.22 152.22 236 | |
# 12 203014 Etham Eltham -28.76 -28.756 153.4 153.40 223 | |
# > | |
# These all look like the same sites in each file, apart from possibily 208026 | |
# Comparison 1: Compare the RFFE output with the FFA results ----------------------------------------------------------------- | |
glimpse(nsw_comp) | |
# nash-sutcliffe | |
hydroGOF::NSE(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1) | |
#[1] -1.291951 | |
hydroGOF::rmse(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1) | |
#[1] 1459.073 | |
hydroGOF::NSE(sim = log10(nsw_comp$flow_1pc), obs = log10(nsw_comp$FFA_AEP_1)) | |
# [1] 0.3914661 | |
# Bias | |
nsw_comp %>% | |
summarise(mean(FFA_AEP_1 - flow_1pc)) | |
nsw_comp %>% | |
ggplot(aes(flow_1pc, FFA_AEP_1)) + | |
geom_point() + | |
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + | |
geom_smooth(colour = 'blue', se = FALSE) + | |
scale_y_continuous(name = "1% AEP using At-site FFA", | |
label = scales::comma, | |
limits = c(0,9000), | |
breaks = seq(0,9000,1000)) + | |
scale_x_continuous(name = "1% AEP estimate from the RFFE", | |
label = scales::comma, | |
limits = c(0,9000), | |
breaks = seq(0,9000,1000)) | |
# Make a web version of the figure | |
p <- nsw_comp %>% | |
ggplot(aes(x = flow_1pc, y = FFA_AEP_1)) + | |
geom_point(size = 0.2) + | |
geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) + | |
geom_smooth(colour = 'blue', se = FALSE, size = 0.1) + | |
scale_y_continuous(name = bquote('1% AEP from at-site FFA (m' ^3 *'s' ^-1 *')'), | |
label = scales::comma, | |
limits = c(0,9000), | |
breaks = seq(0,9000,1000)) + | |
scale_x_continuous(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')'), | |
label = scales::comma, | |
limits = c(0,9000), | |
breaks = seq(0,9000,1000)) + | |
theme_grey(base_size = 5) | |
p | |
ggsave(file.path('../figures', 'FFA-RFFEout.png'), p, width = 4, height = 3) | |
# Log transformed version | |
p <- nsw_comp %>% | |
ggplot(aes( x = flow_1pc, y = FFA_AEP_1)) + | |
geom_point(size = 0.2) + | |
geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) + | |
geom_smooth(colour = 'blue', se = FALSE, size = 0.1) + | |
scale_y_log_eng(name = bquote("1% AEP from at-site FFA (m" ^3 *'s' ^-1 *')'), | |
limits = c(NA, 10000)) + | |
scale_x_log_eng(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')')) + | |
theme_grey(base_size = 5) | |
p | |
ggsave(file.path('../figures', 'FFA-RFFEout_log.png'), p, width = 4, height = 3) | |
# Mean Difference Plot | |
# Alternative analysis based on ratios | |
# Following Bland and Altman 1999 | |
# The correct way to calculate the ratio is y/yhat | |
# In this case y = the at-site values (FFA) | |
# yhat are the RFFE values | |
# Residuals are usually y - yhat | |
# logy - logyhat = log(y/yhat) | |
# # Add logs of 1% values | |
# nsw_comp_log <- nsw_comp %>% | |
# mutate(FFA_AEP_1_log = log10(FFA_AEP_1), | |
# flow_1pc_log = log10(flow_1pc), | |
# diff_log = FFA_AEP_1_log - flow_1pc_log, | |
# mean_log = 0.5 * (FFA_AEP_1_log + FFA_AEP_1_log)) | |
# Add ratios to dataframe | |
nsw_comp_ratio <- nsw_comp %>% | |
mutate(#ratio_1pc = flow_1pc/FFA_AEP_1, | |
ratio_1pc = FFA_AEP_1/flow_1pc, # use the ratio y/yhat | |
mean_1pc = 0.5*(flow_1pc + FFA_AEP_1)) | |
# Calculate statistics of ratios | |
ratio_stats_log <- | |
nsw_comp_ratio %>% | |
mutate(ratio_1pc_log = log10(ratio_1pc)) %>% | |
summarise(mean_difference_log = mean(ratio_1pc_log), | |
sd_diff_log = sd(ratio_1pc_log)) %>% | |
unlist() | |
ratio_stats_log | |
# mean_difference_log sd_diff_log | |
# -0.05677213 0.35657627 | |
10^ratio_stats_log["mean_difference_log"] | |
# mean_difference_log | |
# 0.8774611 | |
ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit | |
ULA | |
# [1] 0.6563804 | |
10^ULA | |
#[1] 4.532945 | |
LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit | |
LLA | |
#[1] -0.7699247 | |
10^LLA | |
# [1] 0.1698538 | |
# These are the same answers as calculated when log-tranforming the differences. | |
# make a data frame for labels | |
nsw_comp_ratio_labels <- nsw_comp_ratio %>% | |
filter(ratio_1pc > 4.53) | |
# Add ratios to dataframe and plot | |
nsw_comp_ratio %>% | |
ggplot(aes(x = mean_1pc, y = ratio_1pc)) + | |
geom_point() + | |
scale_y_log_eng(name = 'Ratio (FFA/RFFE)', | |
limits = c(0.1, 100)) + | |
scale_x_continuous(name = 'Average of AEP 1% estimates') + | |
geom_hline(yintercept = 10^LLA, linetype = 'dashed') + # Upper confidence interval | |
geom_hline(yintercept = 10^ULA, linetype = 'dashed') + # Lower confidence interval | |
geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed') + # mean | |
geom_text(data = nsw_comp_ratio_labels, | |
aes(x = mean_1pc, y = ratio_1pc, label = station_id), | |
nudge_x = -40, | |
hjust = 'right') + | |
annotate(geom = "text", x = 4500, y = 10^ULA + 1, label = sprintf("mean + 2SD = %.2f", 10^ULA), hjust = 'left') + | |
annotate(geom = "text", x = 4500, y = 10^LLA - 0.02, label = sprintf("mean - 2SD = %.2f", 10^LLA), hjust = 'left') + | |
annotate(geom = "text", x = 4500, | |
y = 10^ratio_stats_log['mean_difference_log'] + 0.2, | |
label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']), | |
hjust = 'left') | |
# Make a web version | |
# Everything needs to be scaled down | |
my_size = 0.1 | |
my_text_size = 2 | |
p <- nsw_comp_ratio %>% | |
ggplot(aes(x = mean_1pc, y = ratio_1pc)) + | |
geom_point(size = my_size) + | |
scale_y_log_eng(name = 'Ratio (FFA/RFFE)', | |
limits = c(0.1, 100)) + | |
scale_x_continuous(name = 'Average of AEP 1% estimates (cumec)', label = scales::comma) + | |
geom_hline(yintercept = 10^LLA, linetype = 'dashed', size = my_size) + # Upper confidence interval | |
geom_hline(yintercept = 10^ULA, linetype = 'dashed', size = my_size) + # Lower confidence interval | |
geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed', size = my_size) + # mean | |
geom_text(data = nsw_comp_ratio_labels, | |
aes(x = mean_1pc, y = ratio_1pc, label = station_id), | |
nudge_x = -50, | |
hjust = 'right', | |
size = my_text_size) + | |
annotate(geom = "text", x = 4500, | |
y = 10^ULA + 1, | |
label = sprintf("mean + 2SD = %.2f", 10^ULA), | |
hjust = 'left', | |
size = my_text_size) + | |
annotate(geom = "text", x = 4500, | |
y = 10^LLA - 0.02, | |
label = sprintf("mean - 2SD = %.2f", 10^LLA), | |
hjust = 'left', | |
size = my_text_size) + | |
annotate(geom = "text", x = 4500, | |
y = 10^ratio_stats_log['mean_difference_log'] + 0.2, | |
label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']), | |
hjust = 'left', | |
size = my_text_size) + | |
theme_grey(base_size = 7) | |
p | |
ggsave(file.path('../figures', 'FFA-RFFE_ratio.png'), p, width = 4, height = 3) | |
# Checking outliers | |
# Looking at BoM http://www.bom.gov.au/waterdata/ | |
# 212040 KIALLA CREEK AT POMEROY -34.61 149.54 1%AEP = 110 cumec(GEV) | |
# 210084 GLENNIES CREEK AT THE ROCKS NO.2 -32.36 151.24 1%AEP = 700-1500 cumec(GEV) | |
# 210084 is d/s of Glennies Ck Dam | |
# Checking on the Glennies Ck Site | |
nsw_comp %>% | |
filter(station_id == 210084) %>% | |
select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon) | |
# # A tibble: 1 x 8 | |
# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon | |
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 The Rocks No.2 130 5985 227 -32.370 -32.365 151.24 151.24 | |
# Issues is that flow_1pc is << FFA_AEP_1 i.e. the 1% flow returned by | |
# the RFFE tool is much less than the 1% At-site values calculated by WMA water | |
# Check the RFFE input for 210084 | |
rffe_inputs %>% | |
filter(station_id == '210084') | |
# # A tibble: 1 x 42 | |
# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc | |
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 210084 227 38 -32.270 151.29 -32.365 151.24 -2.181 7.533 15.633 1034 0.7841 4539.6 1573.2 381.38 | |
# the flow_1pc is 4539.6 which is close to the FFA results but | |
# putting the outlet and centroids back into the RFFE results in a very low output. | |
# It look like this is because the bias correction is so large | |
# 212040 KIALLA CREEK AT POMEROY | |
nsw_comp %>% | |
filter(station_id == 212040) %>% | |
select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon) | |
# # A tibble: 1 x 8 | |
# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon | |
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 Pomeroy 138 1819 96 -34.61 -34.61 149.54 149.54 | |
rffe_inputs %>% | |
filter(station_id == '212040') | |
# # A tibble: 1 x 42 | |
# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc flow_10pc | |
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> | |
# 1 212040 96 32 -34.560 149.50 -34.61 149.54 -0.963 5.35 10.6 703.7 0.6906 767.83 513.16 274.83 154.62 | |
# # | |
# Not obvious what the problem is with 212040 but the output 1% AEP value is much less than | |
# the At site values. The input values for this site is also smaller than the At-site FFA value. | |
# How much do we change the limits of agreement if we remove the outliers | |
ratio_stats_log <- | |
nsw_comp_ratio %>% | |
filter(! station_id %in% c(210084, 212040)) %>% | |
mutate(ratio_1pc_log = log10(ratio_1pc)) %>% | |
summarise(mean_difference_log = mean(ratio_1pc_log), | |
sd_diff_log = sd(ratio_1pc_log)) %>% | |
unlist() | |
ratio_stats_log | |
# mean_difference_log sd_diff_log | |
# -0.07510516 0.31743895 | |
10^ratio_stats_log["mean_difference_log"] | |
# mean_difference_log | |
# 0.8411914 | |
ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit | |
ULA | |
# [1] 0.5597728 | |
10^ULA | |
#[1] [1] 3.628881 | |
LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit | |
LLA | |
#[1] -0.7099831 | |
10^LLA | |
# [1] 0.1949921 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment