TonyLadson/RFFE_Comparison.R

## RFFE_Comparison.R
#Packages

library(tidyverse)
library(stringr)
library(dplyr)
library(readxl)
library(readr)
library(purrr)
library(ggrepel)
library(hydroGOF)


# Functions ---------------------------------------------------------------

# not in
"%ni%" <- function(x, y) x[!x %in% y]


# function to plot logarithmic minor breaks
# from https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot


log_breaks = function(maj, radix=10) {
  function(x) {
    minx         = floor(min(logb(x,radix), na.rm=T)) - 1
    maxx         = ceiling(max(logb(x,radix), na.rm=T)) + 1
    n_major      = maxx - minx + 1
    major_breaks = seq(minx, maxx, by=1)
    if (maj) {
      breaks = major_breaks
    } else {
      steps = logb(1:(radix-1),radix)
      breaks = rep(steps, times=n_major) +
        rep(major_breaks, each=radix-1)
    }
    radix^breaks
  }


}
scale_x_log_eng = function(..., radix=10) {
  scale_x_continuous(...,
                     trans=scales::log_trans(radix),
                     breaks=log_breaks(TRUE, radix),
                     minor_breaks=log_breaks(FALSE, radix))
}
scale_y_log_eng = function(..., radix=10) {
  scale_y_continuous(...,
                     trans=scales::log_trans(radix),
                     breaks=log_breaks(TRUE, radix),
                     minor_breaks=log_breaks(FALSE, radix))
}


# RFFE  inputs ---------------------------------

# These have been obtained by scraping the RFFE
# Programatically visiting a large number of sites and
# downloading information on the particular site its its neighbours
# https://rffe.arr-software.org

# Data file RFFE-data-2_extra.csv is at:
#https://drive.google.com/file/d/100qEQlxsKNsEGcqdDHi8Fks5r0sPH-XR/view?usp=sharing

# setwd("~/Dropbox/R-repos/RFFE/R") # to get this working on my computer

fname <- file.path("../data","RFFE-data-2_extra.csv"  )
rffe_inputs <- read_csv(fname, skip = 0)


glimpse(rffe_inputs)

# Check missing data
colSums(is.na(rffe_inputs))

rffe_inputs %>%
  filter(is.na(statistics.mean)) %>%
  select(region.name) %>%
  count(region.name)

# Looks fine the missing values are associated with the arid zone gauges or the Pilbara


# Load information on RFFE gauges ---------------------------------

# From the Appendix A to the Stage 3 Project 5 report
# http://arr.ga.gov.au/__data/assets/pdf_file/0004/40468/ARR_Project-5_Stage-3_report.pdf
# Ataur Rahman,  Khaled Haddad, Md Mahmudul Haque, George Kuczera, Erwin Weinmann (2015)
# Project 5 Regional Flood Methods: Stage 3

# Data is available at
#https://drive.google.com/file/d/1k-LkAaXSUvoGrXMew1-yHRQOAkYHRlc5/view?usp=sharing

fname <- file.path("../data","RFFE-Gauges.xlsx" )
rffe_gauges <- read_excel(fname, skip = 1)

glimpse(rffe_gauges)

rffe_gauges %>%
  distinct(State)

# 1 NSW_ACT
# 2 Vic
# 3 SA
# 4 Tas
# 5 Qld
# 6 WA
# 7 NT
# 8 Arid


# We are only interested in NSW_ACT gauges.  There are also probably
# some NSW gauges in the Arid region but
# We'll leave these out.

rffe_gauges <- rffe_gauges %>%
  filter(State == "NSW_ACT") %>%
  mutate(station_id = as.character(Station_ID)) # Consistent capitalisation and type to other data

glimpse(rffe_gauges)

#colSums(is.na(rffe_gauges))
#nothing missing

# Load NSW RFFE outputs  ------------------------------------------------------


# Data is available from
# https://drive.google.com/file/d/16m2BderA9kOTk1Hrx1iGJh7_ZTBfp7d_/view?usp=sharing

fname <- file.path('../data', "RFFE_output_at_nsw_stations.csv" )

nsw_rffe_gauges_out <- read_csv(fname,
                                col_types = cols(.default = col_double(),
                                                 station_id = col_character(), # use character for consistency with other data
                                                 station_name = col_character()
                                ))


glimpse(nsw_rffe_gauges_out)

#colSums(is.na(nsw_rffe_gauges_out))
# Nothing missing


# WMA data ----------------------------------------------------------------

# At site flood frequency analysis results for ~160 NSW gauges
# Many of which coincide with the gauges used in the RFFE
# Reference
# Scott Podger, Mark Babister, Aaron Trim, Monique Retallick, Melissa Adam (2019)
# Review of ARR design inputs for NSW
# WMA water for office of environment and heritage
# These data come from Table C1 in Appendix C to the report

# Data is availalble from
# https://drive.google.com/file/d/15UoiuLjdgdySpo-4mJfvsuXlTBXj4Gl-/view?usp=sharing

fname <- file.path('../data','TableC1.xlsx' )

nsw_atsite_FFA <- read_xlsx(fname, skip = 5)

glimpse(nsw_atsite_FFA)

nsw_atsite_FFA <- nsw_atsite_FFA %>% # rename some columns so they are easier to work with
  rename(FFA_AEP_50 = `50% AEP`,
         FFA_AEP_20 = `20% AEP`,
         FFA_AEP_10 = `10% AEP`,
         FFA_AEP_5 = `5% AEP`,
         FFA_AEP_2 = `2% AEP`,
         FFA_AEP_1 = `1% AEP`) %>%
  mutate(station_id = as.character(station_id)) # change to character for consistency with other data


# Checking missing
colSums(is.na(nsw_atsite_FFA))
# 4 missing values
nsw_atsite_FFA %>%
  filter(is.na(station))

# station station_id station_name    lon     lat record_length ams_num FFA_AEP_50 FFA_AEP_20 FFA_AEP_10
# <chr>        <int> <chr>         <dbl>   <dbl>         <dbl>   <dbl>      <dbl>      <dbl>      <dbl>
# 1 NA              NA Wollondilly… 150.25 -34.222            57      57        473       1606       2475
# 2 NA              NA Nattai Case… 150.62 -34.222            42      42         88        310        492
# 3 NA              NA Nepean Rive… 150.42 -34.136            43      43        273        983       1525
# 4 NA              NA Cox River a… 150.25 -33.861            56      56        184        779       1279


# These extra sites were added by WMA water they are not included in the project 5 sites
# That is, they can be ignored

# Join -----------------------------------------------------------------

# Join the FFA estimates to the RFFE estimates

glimpse(nsw_rffe_gauges_out)
glimpse(nsw_atsite_FFA)

# join by station_id

nsw_comp <- nsw_rffe_gauges_out %>%
  left_join(nsw_atsite_FFA, by = 'station_id')

# Check the names are the same in both data sets

nsw_comp %>%
  filter(station_name.x != station_name.y)

# the names are all the same so remove one and rename the other

nsw_comp <- nsw_comp %>%
  select(-station_name.y) %>%
  rename(station_name = station_name.x)


# join information on the RFFE gauges

glimpse(rffe_gauges)

nsw_comp <- nsw_comp %>%
  left_join(rffe_gauges, by = "station_id")

glimpse(nsw_comp)

# check that the station names are the same
# along with the lats and lons

options(pillar.sigfig = 5) # Would like to see more digits for the comparison

nsw_comp %>%
  filter(station_name != Station_Name) %>%
  select(station_id, station_name, Station_Name, Lat, lat, Lon, lon, Area) # the capitalised and
# non-capitalised values come from different files

# # A tibble: 12 x 8
# station_id station_name                    Station_Name                  Lat     lat    Lon    lon  Area
# <int> <chr>                           <chr>                       <dbl>   <dbl>  <dbl>  <dbl> <dbl>
# 1     219017 near Brogo                      Near Brogo                 -36.6  -36.598 149.81 149.81   152
# 2     219015 near Bermagui                   Near Bermagui              -36.43 -36.431 150.01 150.00    31
# 3     219001 Brown Mt                        Brown Mountain             -36.6  -36.597 149.44 149.44    15
# 4     216004 Falls Creek                     Falls Ck                   -34.97 -34.968 150.6  150.60    95
# 5     410156 Book                            Book Book                  -35.35 -35.353 147.55 147.55   145
# 6     410038 Butmaroo                        Darbalara                  -35.02 -35.02  148.25 148.25   411
# 7     208026 Forbesdale (Causeway)           Jacky Barkers              -31.64 -32.038 151.74 151.87   560
# 8     210044 Middle Falbrook (Fal Dam sight) Middle Falbrook(Fal Dam Si -32.45 -32.45  151.15 151.15   466
# 9     211010 U/S Wyongh R (Durren La)        U/S Wyong R (Durren La)    -33.25 -33.244 151.39 151.39    92
# 10     204056 Gibrattar Range                 Gibraltar Range            -29.49 -29.49  152.45 152.45   104
# 11     204036 Sandy Hill(below Snake Creek)   Sandy Hill(below Snake Cre -28.93 -28.93  152.22 152.22   236
# 12     203014 Etham                           Eltham                     -28.76 -28.756 153.4  153.40   223
#                                                                                                                       >

# These all look like the same sites in each file, apart from possibily 208026

# Comparison 1: Compare the RFFE output with the FFA results -----------------------------------------------------------------


glimpse(nsw_comp)

# nash-sutcliffe
hydroGOF::NSE(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
#[1] -1.291951

hydroGOF::rmse(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
#[1] 1459.073

hydroGOF::NSE(sim = log10(nsw_comp$flow_1pc), obs = log10(nsw_comp$FFA_AEP_1))
# [1] 0.3914661

# Bias

nsw_comp %>%
  summarise(mean(FFA_AEP_1 - flow_1pc))


nsw_comp %>%
  ggplot(aes(flow_1pc, FFA_AEP_1)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
  geom_smooth(colour = 'blue', se = FALSE) +
  scale_y_continuous(name = "1% AEP using At-site FFA",
                     label = scales::comma,
                     limits = c(0,9000),
                     breaks = seq(0,9000,1000)) +
  scale_x_continuous(name = "1% AEP estimate from the RFFE",
                     label = scales::comma,
                     limits = c(0,9000),
                     breaks = seq(0,9000,1000))


# Make a web version of the figure
p <- nsw_comp %>%
  ggplot(aes(x = flow_1pc, y = FFA_AEP_1)) +
  geom_point(size = 0.2) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
  geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
  scale_y_continuous(name = bquote('1% AEP from at-site FFA (m' ^3 *'s' ^-1 *')'),
                     label = scales::comma,
                     limits = c(0,9000),
                     breaks = seq(0,9000,1000)) +
  scale_x_continuous(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')'),
                     label = scales::comma,
                     limits = c(0,9000),
                     breaks = seq(0,9000,1000)) +
  theme_grey(base_size = 5)
p

ggsave(file.path('../figures', 'FFA-RFFEout.png'), p, width = 4, height = 3)


# Log transformed version

p <- nsw_comp %>%
  ggplot(aes( x = flow_1pc, y = FFA_AEP_1)) +
  geom_point(size = 0.2) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
  geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
  scale_y_log_eng(name = bquote("1% AEP from at-site FFA (m" ^3 *'s' ^-1 *')'),
                  limits = c(NA, 10000)) +
  scale_x_log_eng(name = bquote("1% AEP estimate from the RFFE (m" ^3 *'s' ^-1 *')')) +
  theme_grey(base_size = 5)

p

ggsave(file.path('../figures', 'FFA-RFFEout_log.png'), p, width = 4, height = 3)


# Mean Difference Plot
# Alternative analysis based on ratios
# Following Bland and Altman 1999


# The correct way to calculate the ratio is y/yhat
# In this case y = the at-site values (FFA)
# yhat are the RFFE values

# Residuals are usually y - yhat
# logy - logyhat = log(y/yhat)

# # Add logs of 1% values
# nsw_comp_log <- nsw_comp %>%
#   mutate(FFA_AEP_1_log = log10(FFA_AEP_1),
#          flow_1pc_log = log10(flow_1pc),
#          diff_log = FFA_AEP_1_log - flow_1pc_log,
#          mean_log = 0.5 * (FFA_AEP_1_log + FFA_AEP_1_log))

# Add ratios to dataframe
nsw_comp_ratio <- nsw_comp %>%
  mutate(#ratio_1pc = flow_1pc/FFA_AEP_1,
    ratio_1pc = FFA_AEP_1/flow_1pc, # use the ratio y/yhat
    mean_1pc = 0.5*(flow_1pc + FFA_AEP_1))


# Calculate statistics of ratios
ratio_stats_log <-
  nsw_comp_ratio %>%
  mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
  summarise(mean_difference_log = mean(ratio_1pc_log),
            sd_diff_log = sd(ratio_1pc_log)) %>%
  unlist()

ratio_stats_log
# mean_difference_log         sd_diff_log
# -0.05677213          0.35657627

10^ratio_stats_log["mean_difference_log"]
# mean_difference_log
# 0.8774611

ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
ULA
# [1] 0.6563804
10^ULA
#[1] 4.532945

LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
LLA
#[1] -0.7699247
10^LLA
# [1] 0.1698538

# These are the same answers as calculated when log-tranforming the differences.


# make a data frame for labels

nsw_comp_ratio_labels <- nsw_comp_ratio %>%
  filter(ratio_1pc > 4.53)


# Add ratios to dataframe and plot
nsw_comp_ratio %>%
  ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
  geom_point() +
  scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
                  limits = c(0.1, 100)) +
  scale_x_continuous(name = 'Average of AEP 1% estimates') +
  geom_hline(yintercept = 10^LLA, linetype = 'dashed') + # Upper confidence interval
  geom_hline(yintercept = 10^ULA, linetype = 'dashed') +  # Lower confidence interval
  geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed') + # mean
  geom_text(data = nsw_comp_ratio_labels,
            aes(x = mean_1pc, y = ratio_1pc, label = station_id),
            nudge_x = -40,
            hjust = 'right') +
  annotate(geom = "text", x = 4500, y = 10^ULA + 1, label = sprintf("mean + 2SD = %.2f", 10^ULA), hjust = 'left') +
  annotate(geom = "text", x = 4500, y = 10^LLA - 0.02, label = sprintf("mean - 2SD = %.2f", 10^LLA), hjust = 'left') +
  annotate(geom = "text", x = 4500,
           y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
           label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
           hjust = 'left')


# Make a web version

# Everything needs to be scaled down
my_size = 0.1
my_text_size = 2

p <- nsw_comp_ratio %>%
  ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
  geom_point(size = my_size) +
  scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
                  limits = c(0.1, 100)) +
  scale_x_continuous(name = 'Average of AEP 1% estimates (cumec)', label = scales::comma) +
  geom_hline(yintercept = 10^LLA, linetype = 'dashed', size = my_size) + # Upper confidence interval
  geom_hline(yintercept = 10^ULA, linetype = 'dashed', size = my_size) +  # Lower confidence interval
  geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed', size = my_size) + # mean
  geom_text(data = nsw_comp_ratio_labels,
            aes(x = mean_1pc, y = ratio_1pc, label = station_id),
            nudge_x = -50,
            hjust = 'right',
            size = my_text_size) +
  annotate(geom = "text", x = 4500,
           y = 10^ULA + 1,
           label = sprintf("mean + 2SD = %.2f", 10^ULA),
           hjust = 'left',
           size = my_text_size) +
  annotate(geom = "text", x = 4500,
           y = 10^LLA - 0.02,
           label = sprintf("mean - 2SD = %.2f", 10^LLA),
           hjust = 'left',
           size = my_text_size) +
  annotate(geom = "text", x = 4500,
           y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
           label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
           hjust = 'left',
           size = my_text_size) +
  theme_grey(base_size = 7)
p
ggsave(file.path('../figures', 'FFA-RFFE_ratio.png'), p, width = 4, height = 3)

# Checking outliers
# Looking at BoM http://www.bom.gov.au/waterdata/

# 212040 KIALLA CREEK AT POMEROY -34.61 149.54 1%AEP = 110 cumec(GEV)
# 210084 GLENNIES CREEK AT THE ROCKS NO.2 -32.36 	151.24 1%AEP = 700-1500 cumec(GEV)
# 210084 is d/s of Glennies Ck Dam


# Checking on the Glennies Ck Site

nsw_comp %>%
  filter(station_id == 210084) %>%
  select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)

# # A tibble: 1 x 8
# station_name   flow_1pc FFA_AEP_1  Area     Lat     lat    Lon    lon
# <chr>             <dbl>     <dbl> <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
#   1 The Rocks No.2      130      5985   227 -32.370 -32.365 151.24 151.24

# Issues is that flow_1pc is << FFA_AEP_1 i.e. the 1% flow returned by
# the RFFE tool is much less than the 1% At-site values calculated by WMA water

# Check the RFFE input for 210084
rffe_inputs %>%
  filter(station_id == '210084')

# # A tibble: 1 x 42
#     station_id  area record.length    latc   lonc    lato   lono    bcf i2_6h i50_6h   mar shape.factor flow_1pc flow_2pc flow_5pc
#      <chr>      <dbl>         <dbl>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>        <dbl>    <dbl>    <dbl>    <dbl>
#   1 210084       227            38 -32.270 151.29 -32.365 151.24 -2.181 7.533 15.633  1034       0.7841   4539.6   1573.2   381.38

# the flow_1pc is 4539.6 which is close to the FFA results but
# putting the outlet and centroids back into the RFFE results in a very low output.
# It look like this is because the bias correction is so large


# 212040 KIALLA CREEK AT POMEROY

nsw_comp %>%
  filter(station_id == 212040) %>%
  select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)

# # A tibble: 1 x 8
# station_name flow_1pc FFA_AEP_1  Area    Lat    lat    Lon    lon
# <chr>           <dbl>     <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
# 1 Pomeroy           138      1819    96 -34.61 -34.61 149.54 149.54


rffe_inputs %>%
  filter(station_id == '212040')

# # A tibble: 1 x 42
#       station_id  area record.length    latc   lonc   lato   lono    bcf i2_6h i50_6h   mar shape.factor flow_1pc flow_2pc flow_5pc flow_10pc
#     <chr>      <dbl>         <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>        <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
#   1 212040        96            32 -34.560 149.50 -34.61 149.54 -0.963  5.35   10.6 703.7       0.6906   767.83   513.16   274.83    154.62
# #

# Not obvious what the problem is with 212040 but the output 1% AEP value is much less than
# the At site values.  The input values for this site is also smaller than the At-site FFA value.

# How much do we change the limits of agreement if we remove the outliers

ratio_stats_log <-
  nsw_comp_ratio %>%
  filter(! station_id %in% c(210084, 212040)) %>%
  mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
  summarise(mean_difference_log = mean(ratio_1pc_log),
            sd_diff_log = sd(ratio_1pc_log)) %>%
  unlist()

ratio_stats_log
# mean_difference_log         sd_diff_log
# -0.07510516          0.31743895

10^ratio_stats_log["mean_difference_log"]
# mean_difference_log
# 0.8411914

ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
ULA
# [1] 0.5597728
10^ULA
#[1] [1] 3.628881

LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
LLA
#[1]  -0.7099831
10^LLA
# [1] 0.1949921
	#Packages

	library(tidyverse)
	library(stringr)
	library(dplyr)
	library(readxl)
	library(readr)
	library(purrr)
	library(ggrepel)
	library(hydroGOF)


	# Functions ---------------------------------------------------------------

	# not in
	"%ni%" <- function(x, y) x[!x %in% y]


	# function to plot logarithmic minor breaks
	# from https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot


	log_breaks = function(maj, radix=10) {
	function(x) {
	minx = floor(min(logb(x,radix), na.rm=T)) - 1
	maxx = ceiling(max(logb(x,radix), na.rm=T)) + 1
	n_major = maxx - minx + 1
	major_breaks = seq(minx, maxx, by=1)
	if (maj) {
	breaks = major_breaks
	} else {
	steps = logb(1:(radix-1),radix)
	breaks = rep(steps, times=n_major) +
	rep(major_breaks, each=radix-1)
	}
	radix^breaks
	}




	}
	scale_x_log_eng = function(..., radix=10) {
	scale_x_continuous(...,
	trans=scales::log_trans(radix),
	breaks=log_breaks(TRUE, radix),
	minor_breaks=log_breaks(FALSE, radix))
	}
	scale_y_log_eng = function(..., radix=10) {
	scale_y_continuous(...,
	trans=scales::log_trans(radix),
	breaks=log_breaks(TRUE, radix),
	minor_breaks=log_breaks(FALSE, radix))
	}







	# RFFE inputs ---------------------------------

	# These have been obtained by scraping the RFFE
	# Programatically visiting a large number of sites and
	# downloading information on the particular site its its neighbours
	# https://rffe.arr-software.org

	# Data file RFFE-data-2_extra.csv is at:
	#https://drive.google.com/file/d/100qEQlxsKNsEGcqdDHi8Fks5r0sPH-XR/view?usp=sharing

	# setwd("~/Dropbox/R-repos/RFFE/R") # to get this working on my computer

	fname <- file.path("../data","RFFE-data-2_extra.csv" )
	rffe_inputs <- read_csv(fname, skip = 0)




	glimpse(rffe_inputs)

	# Check missing data
	colSums(is.na(rffe_inputs))

	rffe_inputs %>%
	filter(is.na(statistics.mean)) %>%
	select(region.name) %>%
	count(region.name)

	# Looks fine the missing values are associated with the arid zone gauges or the Pilbara





	# Load information on RFFE gauges ---------------------------------

	# From the Appendix A to the Stage 3 Project 5 report
	# http://arr.ga.gov.au/__data/assets/pdf_file/0004/40468/ARR_Project-5_Stage-3_report.pdf
	# Ataur Rahman, Khaled Haddad, Md Mahmudul Haque, George Kuczera, Erwin Weinmann (2015)
	# Project 5 Regional Flood Methods: Stage 3

	# Data is available at
	#https://drive.google.com/file/d/1k-LkAaXSUvoGrXMew1-yHRQOAkYHRlc5/view?usp=sharing

	fname <- file.path("../data","RFFE-Gauges.xlsx" )
	rffe_gauges <- read_excel(fname, skip = 1)

	glimpse(rffe_gauges)

	rffe_gauges %>%
	distinct(State)

	# 1 NSW_ACT
	# 2 Vic
	# 3 SA
	# 4 Tas
	# 5 Qld
	# 6 WA
	# 7 NT
	# 8 Arid


	# We are only interested in NSW_ACT gauges. There are also probably
	# some NSW gauges in the Arid region but
	# We'll leave these out.

	rffe_gauges <- rffe_gauges %>%
	filter(State == "NSW_ACT") %>%
	mutate(station_id = as.character(Station_ID)) # Consistent capitalisation and type to other data

	glimpse(rffe_gauges)

	#colSums(is.na(rffe_gauges))
	#nothing missing

	# Load NSW RFFE outputs ------------------------------------------------------


	# Data is available from
	# https://drive.google.com/file/d/16m2BderA9kOTk1Hrx1iGJh7_ZTBfp7d_/view?usp=sharing

	fname <- file.path('../data', "RFFE_output_at_nsw_stations.csv" )

	nsw_rffe_gauges_out <- read_csv(fname,
	col_types = cols(.default = col_double(),
	station_id = col_character(), # use character for consistency with other data
	station_name = col_character()
	))



	glimpse(nsw_rffe_gauges_out)

	#colSums(is.na(nsw_rffe_gauges_out))
	# Nothing missing




	# WMA data ----------------------------------------------------------------

	# At site flood frequency analysis results for ~160 NSW gauges
	# Many of which coincide with the gauges used in the RFFE
	# Reference
	# Scott Podger, Mark Babister, Aaron Trim, Monique Retallick, Melissa Adam (2019)
	# Review of ARR design inputs for NSW
	# WMA water for office of environment and heritage
	# These data come from Table C1 in Appendix C to the report

	# Data is availalble from
	# https://drive.google.com/file/d/15UoiuLjdgdySpo-4mJfvsuXlTBXj4Gl-/view?usp=sharing

	fname <- file.path('../data','TableC1.xlsx' )

	nsw_atsite_FFA <- read_xlsx(fname, skip = 5)

	glimpse(nsw_atsite_FFA)

	nsw_atsite_FFA <- nsw_atsite_FFA %>% # rename some columns so they are easier to work with
	rename(FFA_AEP_50 = `50% AEP`,
	FFA_AEP_20 = `20% AEP`,
	FFA_AEP_10 = `10% AEP`,
	FFA_AEP_5 = `5% AEP`,
	FFA_AEP_2 = `2% AEP`,
	FFA_AEP_1 = `1% AEP`) %>%
	mutate(station_id = as.character(station_id)) # change to character for consistency with other data


	# Checking missing
	colSums(is.na(nsw_atsite_FFA))
	# 4 missing values
	nsw_atsite_FFA %>%
	filter(is.na(station))

	# station station_id station_name lon lat record_length ams_num FFA_AEP_50 FFA_AEP_20 FFA_AEP_10
	# <chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 NA NA Wollondilly… 150.25 -34.222 57 57 473 1606 2475
	# 2 NA NA Nattai Case… 150.62 -34.222 42 42 88 310 492
	# 3 NA NA Nepean Rive… 150.42 -34.136 43 43 273 983 1525
	# 4 NA NA Cox River a… 150.25 -33.861 56 56 184 779 1279


	# These extra sites were added by WMA water they are not included in the project 5 sites
	# That is, they can be ignored

	# Join -----------------------------------------------------------------

	# Join the FFA estimates to the RFFE estimates

	glimpse(nsw_rffe_gauges_out)
	glimpse(nsw_atsite_FFA)

	# join by station_id

	nsw_comp <- nsw_rffe_gauges_out %>%
	left_join(nsw_atsite_FFA, by = 'station_id')

	# Check the names are the same in both data sets

	nsw_comp %>%
	filter(station_name.x != station_name.y)

	# the names are all the same so remove one and rename the other

	nsw_comp <- nsw_comp %>%
	select(-station_name.y) %>%
	rename(station_name = station_name.x)


	# join information on the RFFE gauges

	glimpse(rffe_gauges)

	nsw_comp <- nsw_comp %>%
	left_join(rffe_gauges, by = "station_id")

	glimpse(nsw_comp)

	# check that the station names are the same
	# along with the lats and lons

	options(pillar.sigfig = 5) # Would like to see more digits for the comparison

	nsw_comp %>%
	filter(station_name != Station_Name) %>%
	select(station_id, station_name, Station_Name, Lat, lat, Lon, lon, Area) # the capitalised and
	# non-capitalised values come from different files

	# # A tibble: 12 x 8
	# station_id station_name Station_Name Lat lat Lon lon Area
	# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 219017 near Brogo Near Brogo -36.6 -36.598 149.81 149.81 152
	# 2 219015 near Bermagui Near Bermagui -36.43 -36.431 150.01 150.00 31
	# 3 219001 Brown Mt Brown Mountain -36.6 -36.597 149.44 149.44 15
	# 4 216004 Falls Creek Falls Ck -34.97 -34.968 150.6 150.60 95
	# 5 410156 Book Book Book -35.35 -35.353 147.55 147.55 145
	# 6 410038 Butmaroo Darbalara -35.02 -35.02 148.25 148.25 411
	# 7 208026 Forbesdale (Causeway) Jacky Barkers -31.64 -32.038 151.74 151.87 560
	# 8 210044 Middle Falbrook (Fal Dam sight) Middle Falbrook(Fal Dam Si -32.45 -32.45 151.15 151.15 466
	# 9 211010 U/S Wyongh R (Durren La) U/S Wyong R (Durren La) -33.25 -33.244 151.39 151.39 92
	# 10 204056 Gibrattar Range Gibraltar Range -29.49 -29.49 152.45 152.45 104
	# 11 204036 Sandy Hill(below Snake Creek) Sandy Hill(below Snake Cre -28.93 -28.93 152.22 152.22 236
	# 12 203014 Etham Eltham -28.76 -28.756 153.4 153.40 223
	# >

	# These all look like the same sites in each file, apart from possibily 208026

	# Comparison 1: Compare the RFFE output with the FFA results -----------------------------------------------------------------


	glimpse(nsw_comp)

	# nash-sutcliffe
	hydroGOF::NSE(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
	#[1] -1.291951

	hydroGOF::rmse(sim = nsw_comp$flow_1pc, obs = nsw_comp$FFA_AEP_1)
	#[1] 1459.073

	hydroGOF::NSE(sim = log10(nsw_comp$flow_1pc), obs = log10(nsw_comp$FFA_AEP_1))
	# [1] 0.3914661

	# Bias

	nsw_comp %>%
	summarise(mean(FFA_AEP_1 - flow_1pc))



	nsw_comp %>%
	ggplot(aes(flow_1pc, FFA_AEP_1)) +
	geom_point() +
	geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
	geom_smooth(colour = 'blue', se = FALSE) +
	scale_y_continuous(name = "1% AEP using At-site FFA",
	label = scales::comma,
	limits = c(0,9000),
	breaks = seq(0,9000,1000)) +
	scale_x_continuous(name = "1% AEP estimate from the RFFE",
	label = scales::comma,
	limits = c(0,9000),
	breaks = seq(0,9000,1000))





	# Make a web version of the figure
	p <- nsw_comp %>%
	ggplot(aes(x = flow_1pc, y = FFA_AEP_1)) +
	geom_point(size = 0.2) +
	geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
	geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
	scale_y_continuous(name = bquote('1% AEP from at-site FFA (m' ^3 's' ^-1 ')'),
	label = scales::comma,
	limits = c(0,9000),
	breaks = seq(0,9000,1000)) +
	scale_x_continuous(name = bquote("1% AEP estimate from the RFFE (m" ^3 's' ^-1 ')'),
	label = scales::comma,
	limits = c(0,9000),
	breaks = seq(0,9000,1000)) +
	theme_grey(base_size = 5)
	p

	ggsave(file.path('../figures', 'FFA-RFFEout.png'), p, width = 4, height = 3)


	# Log transformed version

	p <- nsw_comp %>%
	ggplot(aes( x = flow_1pc, y = FFA_AEP_1)) +
	geom_point(size = 0.2) +
	geom_abline(slope = 1, intercept = 0, linetype = 'dashed', size = 0.1) +
	geom_smooth(colour = 'blue', se = FALSE, size = 0.1) +
	scale_y_log_eng(name = bquote("1% AEP from at-site FFA (m" ^3 's' ^-1 ')'),
	limits = c(NA, 10000)) +
	scale_x_log_eng(name = bquote("1% AEP estimate from the RFFE (m" ^3 's' ^-1 ')')) +
	theme_grey(base_size = 5)

	p

	ggsave(file.path('../figures', 'FFA-RFFEout_log.png'), p, width = 4, height = 3)



	# Mean Difference Plot
	# Alternative analysis based on ratios
	# Following Bland and Altman 1999


	# The correct way to calculate the ratio is y/yhat
	# In this case y = the at-site values (FFA)
	# yhat are the RFFE values

	# Residuals are usually y - yhat
	# logy - logyhat = log(y/yhat)

	# # Add logs of 1% values
	# nsw_comp_log <- nsw_comp %>%
	# mutate(FFA_AEP_1_log = log10(FFA_AEP_1),
	# flow_1pc_log = log10(flow_1pc),
	# diff_log = FFA_AEP_1_log - flow_1pc_log,
	# mean_log = 0.5 * (FFA_AEP_1_log + FFA_AEP_1_log))

	# Add ratios to dataframe
	nsw_comp_ratio <- nsw_comp %>%
	mutate(#ratio_1pc = flow_1pc/FFA_AEP_1,
	ratio_1pc = FFA_AEP_1/flow_1pc, # use the ratio y/yhat
	mean_1pc = 0.5*(flow_1pc + FFA_AEP_1))


	# Calculate statistics of ratios
	ratio_stats_log <-
	nsw_comp_ratio %>%
	mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
	summarise(mean_difference_log = mean(ratio_1pc_log),
	sd_diff_log = sd(ratio_1pc_log)) %>%
	unlist()

	ratio_stats_log
	# mean_difference_log sd_diff_log
	# -0.05677213 0.35657627

	10^ratio_stats_log["mean_difference_log"]
	# mean_difference_log
	# 0.8774611

	ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
	ULA
	# [1] 0.6563804
	10^ULA
	#[1] 4.532945

	LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
	LLA
	#[1] -0.7699247
	10^LLA
	# [1] 0.1698538

	# These are the same answers as calculated when log-tranforming the differences.



	# make a data frame for labels

	nsw_comp_ratio_labels <- nsw_comp_ratio %>%
	filter(ratio_1pc > 4.53)


	# Add ratios to dataframe and plot
	nsw_comp_ratio %>%
	ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
	geom_point() +
	scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
	limits = c(0.1, 100)) +
	scale_x_continuous(name = 'Average of AEP 1% estimates') +
	geom_hline(yintercept = 10^LLA, linetype = 'dashed') + # Upper confidence interval
	geom_hline(yintercept = 10^ULA, linetype = 'dashed') + # Lower confidence interval
	geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed') + # mean
	geom_text(data = nsw_comp_ratio_labels,
	aes(x = mean_1pc, y = ratio_1pc, label = station_id),
	nudge_x = -40,
	hjust = 'right') +
	annotate(geom = "text", x = 4500, y = 10^ULA + 1, label = sprintf("mean + 2SD = %.2f", 10^ULA), hjust = 'left') +
	annotate(geom = "text", x = 4500, y = 10^LLA - 0.02, label = sprintf("mean - 2SD = %.2f", 10^LLA), hjust = 'left') +
	annotate(geom = "text", x = 4500,
	y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
	label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
	hjust = 'left')


	# Make a web version

	# Everything needs to be scaled down
	my_size = 0.1
	my_text_size = 2

	p <- nsw_comp_ratio %>%
	ggplot(aes(x = mean_1pc, y = ratio_1pc)) +
	geom_point(size = my_size) +
	scale_y_log_eng(name = 'Ratio (FFA/RFFE)',
	limits = c(0.1, 100)) +
	scale_x_continuous(name = 'Average of AEP 1% estimates (cumec)', label = scales::comma) +
	geom_hline(yintercept = 10^LLA, linetype = 'dashed', size = my_size) + # Upper confidence interval
	geom_hline(yintercept = 10^ULA, linetype = 'dashed', size = my_size) + # Lower confidence interval
	geom_hline(yintercept = 10^ratio_stats_log["mean_difference_log"], linetype = 'dashed', size = my_size) + # mean
	geom_text(data = nsw_comp_ratio_labels,
	aes(x = mean_1pc, y = ratio_1pc, label = station_id),
	nudge_x = -50,
	hjust = 'right',
	size = my_text_size) +
	annotate(geom = "text", x = 4500,
	y = 10^ULA + 1,
	label = sprintf("mean + 2SD = %.2f", 10^ULA),
	hjust = 'left',
	size = my_text_size) +
	annotate(geom = "text", x = 4500,
	y = 10^LLA - 0.02,
	label = sprintf("mean - 2SD = %.2f", 10^LLA),
	hjust = 'left',
	size = my_text_size) +
	annotate(geom = "text", x = 4500,
	y = 10^ratio_stats_log['mean_difference_log'] + 0.2,
	label = sprintf("bias = %.2f", 10^ratio_stats_log['mean_difference_log']),
	hjust = 'left',
	size = my_text_size) +
	theme_grey(base_size = 7)
	p
	ggsave(file.path('../figures', 'FFA-RFFE_ratio.png'), p, width = 4, height = 3)

	# Checking outliers
	# Looking at BoM http://www.bom.gov.au/waterdata/

	# 212040 KIALLA CREEK AT POMEROY -34.61 149.54 1%AEP = 110 cumec(GEV)
	# 210084 GLENNIES CREEK AT THE ROCKS NO.2 -32.36 151.24 1%AEP = 700-1500 cumec(GEV)
	# 210084 is d/s of Glennies Ck Dam


	# Checking on the Glennies Ck Site

	nsw_comp %>%
	filter(station_id == 210084) %>%
	select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)

	# # A tibble: 1 x 8
	# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon
	# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 The Rocks No.2 130 5985 227 -32.370 -32.365 151.24 151.24

	# Issues is that flow_1pc is << FFA_AEP_1 i.e. the 1% flow returned by
	# the RFFE tool is much less than the 1% At-site values calculated by WMA water

	# Check the RFFE input for 210084
	rffe_inputs %>%
	filter(station_id == '210084')

	# # A tibble: 1 x 42
	# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc
	# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 210084 227 38 -32.270 151.29 -32.365 151.24 -2.181 7.533 15.633 1034 0.7841 4539.6 1573.2 381.38

	# the flow_1pc is 4539.6 which is close to the FFA results but
	# putting the outlet and centroids back into the RFFE results in a very low output.
	# It look like this is because the bias correction is so large


	# 212040 KIALLA CREEK AT POMEROY

	nsw_comp %>%
	filter(station_id == 212040) %>%
	select(station_name, flow_1pc, FFA_AEP_1, Area, Lat, lat, Lon, lon)

	# # A tibble: 1 x 8
	# station_name flow_1pc FFA_AEP_1 Area Lat lat Lon lon
	# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 Pomeroy 138 1819 96 -34.61 -34.61 149.54 149.54


	rffe_inputs %>%
	filter(station_id == '212040')

	# # A tibble: 1 x 42
	# station_id area record.length latc lonc lato lono bcf i2_6h i50_6h mar shape.factor flow_1pc flow_2pc flow_5pc flow_10pc
	# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
	# 1 212040 96 32 -34.560 149.50 -34.61 149.54 -0.963 5.35 10.6 703.7 0.6906 767.83 513.16 274.83 154.62
	# #

	# Not obvious what the problem is with 212040 but the output 1% AEP value is much less than
	# the At site values. The input values for this site is also smaller than the At-site FFA value.

	# How much do we change the limits of agreement if we remove the outliers

	ratio_stats_log <-
	nsw_comp_ratio %>%
	filter(! station_id %in% c(210084, 212040)) %>%
	mutate(ratio_1pc_log = log10(ratio_1pc)) %>%
	summarise(mean_difference_log = mean(ratio_1pc_log),
	sd_diff_log = sd(ratio_1pc_log)) %>%
	unlist()

	ratio_stats_log
	# mean_difference_log sd_diff_log
	# -0.07510516 0.31743895

	10^ratio_stats_log["mean_difference_log"]
	# mean_difference_log
	# 0.8411914

	ULA <- as.vector(ratio_stats_log["mean_difference_log"] + 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
	ULA
	# [1] 0.5597728
	10^ULA
	#[1] [1] 3.628881

	LLA <- as.vector(ratio_stats_log["mean_difference_log"] - 2* ratio_stats_log["sd_diff_log"]) # Upper Limit
	LLA
	#[1] -0.7099831
	10^LLA
	# [1] 0.1949921