Skip to content

Instantly share code, notes, and snippets.

@TonyLadson
Last active June 9, 2016 01:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TonyLadson/2d63ca70eef92583001dece607127759 to your computer and use it in GitHub Desktop.
Save TonyLadson/2d63ca70eef92583001dece607127759 to your computer and use it in GitHub Desktop.
###########################################################################################
#
# Fitting non-linear models in R
# See tonyladson
#
# 5 June 2016
###############################################################################################
#
# Example data come from Figure 5 from Ian Cordery's thesis
# Also Figure 3 from Cordery, 1970
# Cordery, I. (1968). Improved techniques in flood estimation and forecasting.
# PhD, The University of New South Wales.
#
# Cordery, I. (1970). "Initial loss for flood estimation and forecasting."
# Journal of the Hydraulics Division, American Society of Civil Engineers 96(12): 2447-2466.
#remove(list = objects())
library(ggplot2)
library(dplyr)
library(propagate)
library(nlstools)
library(nls2) # this loads the proto package which is used to calcualte confidence intervals
library(fpp) # for the Acf function
library(car)
library(ggExtra)
# Data from digitizing Figure 5 from Cordery, 1968
# See http://www.unsworks.unsw.edu.au/primo_library/libweb/action/dlDisplay.do?vid=UNSWORKS&docId=unsworks_36997
Fig5 <-
structure(list(x = c(9.64021986421365, 37.5464569595027, 25.7273447779685,
46.7390997673626, 104.193117316487, 108.132821376998, 86.1592245759561,
81.2589611595452, 76.0060224121967, 93.4063820127887, 79.6033107877613,
90.7658056258769, 120.32769309295, 111.463358956799, 125.908940512008,
120.656001764659, 151.188708233622, 147.57731284482, 139.041287380379,
143.3093001126, 147.591419858058, 148.576345873186, 174.855146623166,
155.79913665079, 208.328524124275, 203.732202720345, 184.361991089498,
188.60563716249, 180.726229041467, 197.812386983587, 201.09547370068,
236.895225930229, 247.729412096636, 273.994105833378, 253.967276859112,
251.340807485438, 269.055368744501, 306.48255731936, 318.301669500894,
320.27152153115, 342.92481987909, 336.030337773195, 335.702029101486,
333.089666741049, 318.315776514131, 322.912097918061, 410.242204592731,
395.782516024285, 411.54133226633, 432.224778584015),
y = c(2.54132343897037, 2.50762114884014, 2.38923518377857, 2.41501285415006, 2.30416887155265,
2.23724762261166, 2.18569228186868, 2.17557253236737, 2.18588360051597,
2.00543990791553, 1.93068466383821, 1.86624048790948, 2.03906164282585,
2.10866135282887, 2.162794460609, 2.09835028468028, 2.19888319912909,
2.17288400190285, 2.03626234893394, 1.88406333031477, 1.76033051253161,
1.73197507512297, 1.99748507994933, 2.01552944920937, 2.18308430662406,
1.94850750624349, 1.84024129068323, 1.86601896105472, 1.88148556327762,
1.75259721142016, 1.75259721142016, 1.801574785126, 1.77321934771736,
1.78868594994025, 1.4973982747424, 1.47162060437091, 1.22684349926912,
1.43287354359376, 1.85047180361192, 1.61073946915705, 1.64167267360284,
1.56433966248837, 1.48937296095878, 1.25479616057821, 1.20066305279808,
1.04341926353198, 1.56670597207325, 1.41461771688145, 1.07693023501492,
1.07950800205207)),
.Names = c("x", "y"),
row.names = c(NA, -50L),
class = c("tbl_df", "tbl", "data.frame"))
# Create data frame from digitised data
bobo.dat <- Fig5 %>% # Bobo River
mutate(API = x) %>%
mutate(ILs = 10^y) %>% # ILs is on a log scale
mutate(ILs_mm = ILs/100*25.4) %>% # convert from points (inches/100) to mm
mutate(API_mm = API/100*25.4)
# Make a plot the initial loss, API relationship
API_ILplot <- bobo.dat %>%
ggplot(aes(x = API_mm, y = ILs_mm)) + geom_point() +
theme_bw() +
scale_x_continuous(name = 'API (mm)') +
scale_y_continuous(name = 'Initial loss (mm)') +
theme(
panel.background = element_rect(fill="gray98"),
axis.title.x = element_text(colour="grey20", size=16, margin=margin(20,0,0,0)),
axis.text.x = element_text(colour="grey20",size=14),
axis.title.y = element_text(colour="grey20",size=16, margin = margin(0,20,0,0)),
axis.text.y = element_text(colour="grey20",size=14),
legend.title = element_text(colour="grey20",size=12),
plot.margin = unit(c(0.5, 0.5, 1, 0.5), "cm")) # top, right, bottom, left
# Draw the plot
API_ILplot
# Check marginal histograms
#ggExtra::ggMarginal(API_ILplot, type = "histogram")
# Model
# From Cordery
# IL = ILmax(N)^API
# Preview model to check starting values are reasonable
nlstools::preview(ILs_mm ~ ILmax*N^API_mm, data = bobo.dat, start = list(ILmax = 80, N = 0.98))
# fit the model using NLS
bobo.mod <- nls(ILs_mm ~ ILmax*N^API_mm, data = bobo.dat, start = list(ILmax = 80, N = 0.98), trace = TRUE)
# add fitted model to plot
# geom_smooth can use nls and add the fitted model or use nlstools::plotfit
API_ILplot +
geom_smooth(method = "nls", formula = y ~ ILmax*a^x, method.args = list(start = list(ILmax = 84, a = 0.97)), se = FALSE)
# Check model
summary(bobo.mod)
nlstools::overview(bobo.mod)
#
# ------
# Formula: ILs_mm ~ ILmax * N^API_mm
#
# Parameters:
# Estimate Std. Error t value Pr(>|t|)
# ILmax 84.80480 6.13395 13.82 <2e-16 ***
# N 0.96886 0.00268 361.49 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 9.023 on 48 degrees of freedom
#
# Number of iterations to convergence: 7
# Achieved convergence tolerance: 1.921e-06
#
# ------
# Residual sum of squares: 3910
#
# ------
# t-based confidence interval:
# 2.5% 97.5%
# ILmax 72.4716594 97.1379372
# N 0.9634766 0.9742544
#
# ------
# Correlation matrix:
# ILmax N
# ILmax 1.0000000 -0.8074648
# N -0.8074648 1.0000000
#
# R-square
RSS <- sum(residuals(bobo.mod)^2)
TSS <- sum((bobo.dat$ILs_mm - mean(bobo.dat$ILs_mm))^2)
R.square <- 1 - (RSS/TSS)
R.square
#[1] 0.7934562
deviance(bobo.mod)
# [1] 3907.827
logLik(bobo.mod)
# 'log Lik.' -179.9148 (df=3)
coef(bobo.mod)
# Confidence intervals for parameters
confint(bobo.mod)
nlstools::confint2(bobo.mod, method = "asymptotic")
# 2.5 % 97.5 %
# ILmax 72.4716594 97.1379372
# N 0.9634766 0.9742544
nlstools::confint2(bobo.mod, method = "profile")
# 2.5% 97.5%
# ILmax 72.4065771 98.3413675
# N 0.9627527 0.9743995
# Bootstrap confidence intervals for parameters
boot.cl <- nlstools::nlsBoot(bobo.mod)
boot.cl$bootCI
# Median 2.5% 97.5%
# ILmax 84.7308584 73.086423 97.0838838
# N 0.9691428 0.963597 0.9741001
# Confidence region
plot(nlstools::nlsConfRegions(bobo.mod))
nlstools::nlsContourRSS(bobo.mod)
plot(nlsContourRSS(bobo.mod), nlev = 4)
# Check assumptions
test.nlsResiduals(nlsResiduals(bobo.mod))
# residuals
# 1. Constant variance
plot(nlsResiduals(bobo.mod)) # not too bad
# 2. Normal errors
#
plot(nlsResiduals(bobo.mod), which = 6)
# perhaps heavy tailed, could consider a transformation but passes normality test
plot(nlsResiduals(bobo.mod), which = 5) # looks ok
# 3. Independence
nlstools::test.nlsResiduals(nlsResiduals(bobo.mod)) # autocorrelated
plot(nlsResiduals(bobo.mod), which = 4)
forecast::Acf(resid(bobo.mod))
# This autocorrelation is just an artifact of the way that I've collected the data
# i.e. by digitizing it so its sorted from high IL to low IL
# We can test by randomising the input data and re-running the model
set.seed(2016)
bobo.dat.rand <- bobo.dat[sample(1:50) ,]
bobo.mod.rand <- nls(ILs_mm ~ ILmax*N^API_mm, data = bobo.dat.rand, start = list(ILmax = 80, N = 0.98), trace = TRUE)
test.nlsResiduals(nlsResiduals(bobo.mod.rand))
# passes teh runs test
# influential observations
plot(nlstools::nlsJack(bobo.mod))
summary(nlstools::nlsJack(bobo.mod))
# Bootstrap
plot(nlsBoot(bobo.mod))
plot(nlsBoot(bobo.mod), type = "boxplot")
#_________________________________________________________________________________________________________________
# Confidence intervals for predicted values
# It seems this is not straight forward
# https://rmazing.wordpress.com/2013/08/14/predictnls-part-1-monte-carlo-simulation-confidence-intervals-for-nls-models/
# http://www.inside-r.org/packages/cran/propagate/docs/predictNLS
# https://quantitativeconservationbiology.wordpress.com/2013/07/02/confidence-interval-for-a-model-fitted-with-nls-in-r/
# see http://www.leg.ufpr.br/~walmes/cursoR/ciaeear/as.lm.R
# https://quantitativeconservationbiology.wordpress.com/2013/07/02/confidence-interval-for-a-model-fitted-with-nls-in-r/
# http://thr3ads.net/r-help/2011/05/1053390-plotting-confidence-bands-from-predict.nls
# Using the as.lm approach
# from http://www.leg.ufpr.br/~walmes/cursoR/ciaeear/as.lm.R
# if object is an "nls" object then its often used like this:
# predict(as.lm(object), ...) where ... are any predict.lm args
# as.lm.nls effectively just does this:
# lm(lhs ~ gradient - 1, offset = fitted(object),
# list(gradient = object$m$gradient(), lhs = object$m$lhs()))
# so most of the code is just to get the names right.
as.lm <- function(object, ...) UseMethod("as.lm")
as.lm.nls <- function(object, ...) {
if (!inherits(object, "nls")) {
w <- paste("expected object of class nls but got object of class:",
paste(class(object), collapse = " "))
warning(w)
}
gradient <- object$m$gradient()
if (is.null(colnames(gradient))) {
colnames(gradient) <- names(object$m$getPars())
}
response.name <- if (length(formula(object)) == 2) "0" else
as.character(formula(object)[[2]])
lhs <- object$m$lhs()
L <- data.frame(lhs, gradient)
names(L)[1] <- response.name
fo <- sprintf("%s ~ %s - 1", response.name,
paste(colnames(gradient), collapse = "+"))
fo <- as.formula(fo, env = as.proto.list(L))
do.call("lm", list(fo, offset = substitute(fitted(object))))
}
# Calculate and add confidence intervals to plot
# Calculate prediction intervals for all the given data points
predCI <- predict(as.lm.nls(bobo.mod), interval = 'prediction', level = 0.95)
predCI
# Then we need to interpolate to make smooth contour intervals
x.seq <- seq(min(bobo.dat$API_mm),max(bobo.dat$API_mm), length.out = 100) # generate 100 points between the max and min of the x values
lwr <- approx(bobo.dat$API_mm, predCI [, 2], xout = x.seq)$y ## lower CI
upr <- approx(bobo.dat$API_mm, predCI[, 3], xout = x.seq)$y ## upper CI
# tidy into data frame
CL.as.lm <- data_frame(x.seq = x.seq, lwr = lwr, upr = upr)
# Add to plot
API_ILplot +
geom_smooth(method = "nls", formula = y ~ ILmax*a^x, method.args = list(start = list(ILmax = 80, a = 0.98)), se = FALSE) +
geom_line(data = CL.as.lm, aes(x = x.seq, y = upr), linetype = 2) +
geom_line(data = CL.as.lm, aes(x = x.seq, y = lwr), linetype = 2)
#_________________________________________________________________________________________
# Using the predictNLS function from the propagate package.
#help(predictNLS)
# Test for a particular value of API
my.newdata <- data.frame(API_mm = 30)
x <- propagate::predictNLS(bobo.mod, interval = "prediction", newdata = my.newdata, alpha = 0.05)
x
# $summary
# Prop.Mean.1 Prop.Mean.2 Prop.sd.1 Prop.sd.2 Prop.2.5% Prop.97.5% Sim.Mean Sim.sd Sim.Median Sim.MAD Sim.2.5% Sim.97.5%
# 1 32.8341 32.78426 1.616907 1.622614 14.35145 51.21707 32.784 1.621922 32.73954 1.613052 29.72054 36.07428
#
# $prop
# $prop[[1]]
# Results from error propagation:
# Mean.1 Mean.2 sd.1 sd.2 2.5% 97.5%
# 32.834103 32.784258 1.616907 1.622614 29.616476 35.967704
#
# Results from Monte Carlo simulation:
# Mean sd Median MAD 2.5% 97.5%
# 32.783996 1.621922 32.739536 1.613052 29.720536 36.074284
# As before, generate a sequence of x values for plotting
x.seq <- seq(min(bobo.dat$API_mm),max(bobo.dat$API_mm), length.out = 100) # generate 100 points between the max and min of the x values
my.newdata <- data.frame(API_mm = x.seq) # 100 API values where CL will be calculated
# calculate confidence limits as each x.seq
CL.predictNLS <- propagate::predictNLS(bobo.mod, interval = "prediction", newdata = my.newdata, alpha = 0.05)$summary
head(CL.predictNLS)
# add to plot
API_ILplot +
geom_smooth(method = "nls", formula = y ~ ILmax*a^x, method.args = list(start = list(ILmax = 80, a = 0.98)), se = FALSE) +
geom_line(data = CL.as.lm, aes(x = x.seq, y = upr), linetype = 1) +
geom_line(data = CL.as.lm, aes(x = x.seq, y = lwr), linetype = 1) +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Prop.2.5%`), linetype = 2, colour = 'Blue') +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Prop.97.5%`), linetype = 2, colour = 'Blue') +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Sim.2.5%`), linetype = 2, colour = 'Green') +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Sim.97.5%`), linetype = 2, colour = 'Green')
# in this case, the prediction intervals are pretty much the same
# The Sim intervals are confidence intervals not prediction intervals
# Plot for blog
API_ILplot +
geom_smooth(method = "nls", formula = y ~ ILmax*a^x, method.args = list(start = list(ILmax = 80, a = 0.98)), se = FALSE) +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Prop.2.5%`), linetype = 2, colour = 'Blue') +
geom_line(data = CL.predictNLS, aes(x = x.seq, y = `Prop.97.5%`), linetype = 2, colour = 'Blue') +
geom_hline(yintercept = 0)
# When does the lower prediction interval become negative
x.seq[which.min(abs(CL.predictNLS$`Prop.2.5%`))]
# [1] 47.9853
#_________________________________________________________________________
# Other models
# Log-Log model
bobo.mod.log <- lm(log(ILs_mm) ~ log(API_mm), data = bobo.dat)
bobo.mod.log
summary(bobo.mod.log)
# Check residuals
plot(bobo.mod.log)
# Comparison of Log-Log model and Cordery's model
resid(bobo.mod.log)
Acf(resid(bobo.mod.log)) # some autocorrelation
# Compare models with AIC
AIC(bobo.mod.log)
# [1] 77.61382
AIC(bobo.mod)
# [1] 365.8295
# So the log-log model is better (at least according to the AIC)
# Calculate prediction intervals and plot
x.seq <- seq(min(bobo.dat$API_mm),max(bobo.dat$API_mm), length.out = 100) # generate 100 points between the max and min of the x values
my.newdata <- data.frame(API_mm = x.seq) # 100 API values where CL will be calculated
CL.log_log <- as.data.frame(predict(bobo.mod.log, newdata = my.newdata, interval = 'prediction' ))
API_ILplot +
geom_line(data = CL.log_log, aes(x = x.seq, y = exp(fit)), linetype = 1, colour = 'Blue') +
geom_line(data = CL.log_log, aes(x = x.seq, y = exp(lwr)), linetype = 2, colour = 'Blue') +
geom_line(data = CL.log_log, aes(x = x.seq, y = exp(upr)), linetype = 2, colour = 'Blue') +
scale_y_continuous(name = 'Initial loss (mm)', limits = c(1, 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment