floybix/00_init.R

## 00_init.R

## groundwater database analysis scripts
## Felix Andrews <felix@nfrac.org> 2010-08-01

library(RODBC)
library(zoo)
library(lattice)
library(latticeExtra)
library(sp)
library(rgdal)
library(mgcv)
library(plyr) ## for progress bars
library(maps) ## for world.cities

stopifnot(packageDescription("zoo")$Version >= "1.7")
stopifnot(packageDescription("lattice")$Version >= "0.19")
stopifnot(packageDescription("latticeExtra")$Version >= "0.6-14")

lattice.options(default.theme = custom.theme.2())

## use Windows progress bar. otherwise change this to progress_text or progress_tk
myprogress <- function(title = "Progress") progress_win(title)

setwd("X:/Projects/BOMGW")
conn <- odbcConnectAccess("MdbCsv/water.mdb")
sqlTables(conn)
locs <- sqlFetch(conn, "Locations")
locs$DrilledDpth <- as.numeric(gsub(",", "", locs$DrilledDpth))

gmus <- readOGR("agmu_r9nnd_00311a01e_sgeo___", "gmushape")
rownames(gmus@data) <- make.unique(as.character(gmus@data$GMU_NAME))

#rivers <- readOGR("river_reaches/arc_reach_shape", "arc_reach")
#rivers <- rivers[,c("STATE", "BASIN", "A", "M")]
#save(rivers, file = "rivers.Rdata")
load("rivers.Rdata")

data(world.cities)
cities.all <- subset(world.cities, country.etc == "Australia")
cities.ok <- subset(world.cities, country.etc == "Australia" & (pop >= 50000) |
                 (name %in% c("Darwin", "Alice Springs", "Kalgoolie", "Broome", "Walgett",
                              "Mount Isa", "Esperance", "Broken Hill", "Coober Pedy",
                              "Bendigo", "Ballarat", "Sale", "Mildura"))) # "Echuca-Moama"

source("01_script.R")
source("02_timeseries.R")
source("03_plot_all_sites.R")
source("04_plot_distributions.R")
source("05_plot_spatial.R")

## now you are ready to run "0_nsw.R" etc

## 01_script.R

## groundwater database analysis scripts
## Felix Andrews <felix@nfrac.org> 2010-08-01

do_groundwater_analysis <-
    function(infolist, state = "VIC", ident = state,
             refyears = c(1985, 1990, 1995), now = 2007,
             gmus.state = gmus[gmus$STATE == state,],
             rivers.state = rivers[rivers$STATE == state,],
             do.what = c("state", "gmus"))
{
    if (!file.exists(ident)) dir.create(ident)

    message("generating distribution plots")
    png(sprintf("%s/%s_distributions_%%01d.png", ident, ident),
        width = 3000, height = 1500, res = 300)
    print(plot_distributions(infolist, ident = ident, now = now))
    dev.off()

    ## write out differences from current as shapefile (points)
    message("writing points to shapefile")
    deltas <- as.data.frame(t(infolist$deltas))
    deltas$stationID <- rownames(deltas)
    pt.deltas <- merge(locs, deltas)
    pt.deltas <- subset(pt.deltas, Lng > 120)
    ## need to keep column names to at most 10 characters for writeOGR
    colnames(pt.deltas) <- substring(colnames(pt.deltas), first = 1, last = 10)
    pt.deltas <- SpatialPointsDataFrame(coords = pt.deltas[,c("Lng","Lat")],
                                        data = pt.deltas)
    writeOGR(pt.deltas,
             sprintf("%s/%s_gw_trends", ident, ident),
             "sites_deltas",
             driver = "ESRI Shapefile")

    if ("state" %in% do.what) {
        for (refyear in refyears) {
            message("aggregating to GMUs for reference year ", refyear)
            ## calculate quantiles of changes over the set of bores in each GMU
            qqtmp <-
                overlayEach(pt.deltas[,toString(refyear)], gmus.state,
                            fn = function(x) {
                                qq <- quantile(x[,1], c(0.5, 0.05, 0.01), na.rm = TRUE)
                                names(qq) <- c("median", "worst_5pc", "worst_1pc") ## max 10 chars!
                                qq[["n_values"]] <- sum(!is.na(x[,1]))
                                qq
                            },
                            out.n = 4)
            ii.na <- is.na(qqtmp[,"n_values"])
            qqtmp[ii.na, "n_values"] <- 0

            ## write out aggregated stats as shapefile (polygons)
            gmus.ref <- gmus.state
            gmus.ref@data <- cbind(qqtmp, gmus.ref@data)

            writeOGR(gmus.ref,
                     sprintf("%s/%s_gw_trends", ident, ident),
                     sprintf("gmus_%i", refyear),
                     driver = "ESRI Shapefile")

            pt.deltas.ref <- pt.deltas[,c(refyear, "stationID", "longName")]
            colnames(pt.deltas.ref@data)[1] <- "value"

            ## generate maps (quantiles of changes in GMUs)
            foo <-
                gmuLevelPlot(gmus.ref[,1:4], pt.deltas.ref, refyear = refyear,
                             xlab.top = ident)

            ##pdf(sprintf("%s/%s_deltas%i_gmus.pdf", ident, ident, refyear),
            ##    paper = "a4r", width = 18, height = 10)
            ##print(foo)
            ##dev.off()

            png(sprintf("%s/%s_deltas%i_gmus.png", ident, ident, refyear),
                width = 6000, height = 3000, res = 300)
            print(foo)
            dev.off()
        }
    }

    if ("gmus" %in% do.what) {
        ## generate detailed results for each GMU
        l_ply(seq(1, nrow(gmus.state)),
              .progress = myprogress("generating detailed results for each GMU"),
              function (i)
          {
              gmuname <- as.character(gmus.state@data[i, "GMU_NAME"])
              gmuno <- as.character(gmus.state@data[i, "GMU_NO"])
              message(gmuno, gmuname)
              gmunameok <- make.names(substr(gmuname, 1, 15))
              nameok <- paste(sprintf("%s/%s", ident, ident), gmuno, gmunameok, sep = "_")
              ##pdf(paste(nameok, ".pdf", sep = ""), paper = "a4r", width = 18, height = 10)
              png(paste(nameok, "%01d.png", sep = ""), width = 3000, height = 1500, res = 300)

              anyOK <- FALSE
              for (refyear in refyears) {

                  pt.deltas.ref <- pt.deltas[,c(toString(refyear), "stationID", "longName")]
                  colnames(pt.deltas.ref@data)[1] <- "value"

                  foo <-
                      tryCatch(gmuLevelPlotGAM(gmus.state[,1], pt.deltas.ref, subset = i,
                                          refyear = refyear, cities = cities.all),
                               error = force)
                  if (inherits(foo, "error")) {
                      message(foo$message)
                      next
                  } else {
                      if (anyOK == FALSE) ## only once:
                          print(foo$context)
                      anyOK <- TRUE
                  }

                  print(foo$levels +
                        layer(sp.lines(rivers.state, lwd = 2, col = "blue", alpha = 0.5),
                              data = environment())
                        )
              }

              if (anyOK) {
                  foo <-
                      tryCatch(gmuHorizonplot(gmus.state[,1], pt.deltas, subset = i,
                                              levelzoo = infolist$levels),
                               error = force)
                  if (inherits(foo, "error")) {
                      message(foo$message)
                  } else {
                      print(foo)

                      print(gmuDistributionTrend(gmus.state[,1], pt.deltas, subset = i,
                                                 deltas = infolist$deltas))
                  }
              }

              dev.off()
          })
    }
}

## 02_timeseries.R

## merge a list of zoo objects (and put all on the same time scale).
## return the result as a merged zoo matrix, indexed by seq(start, end, by = by)
## This is similar to do.call("merge", tslist) but that can fail due to memory usage.
merge_zoolist <- function(tslist, ..., by = 1L)
{
    start <- do.call(min, lapply(tslist, start))
    end <- do.call(max, lapply(tslist, end))
    newindex <- seq(start, end, by = by)
    ## put each series on the same index and extract core data
    coreagg <- llply(tslist, function(z) {
        if (length(z) == 0)
            z <- zoo(1) ## need this, otherwise merged series has 0 columns
        coredata(merge(z, zoo(, newindex), all = c(FALSE, TRUE)))
    }, ...)
    ans <- matrix(unlist(coreagg, use.names = FALSE),
                  ncol = length(tslist), nrow = length(newindex))
    colnames(ans) <- names(tslist)
    zoo(ans, newindex)
}


gw_timeseries_process <- function(tslist, now)
{
    ans <- list()

    message("Processing time series...")
    ntasks <- 4
    task.i <- 0

    ## aggregate to regular monthly (median) series
    task.i <- task.i + 1
    message("Task ", task.i, " out of ", ntasks)
    ans$monthly <-
        llply(tslist,
              .progress = myprogress("Aggregating (by medians) to regular monthly series"),
              function(z)
          {
              z <- aggregate(z, as.yearmon,
                             function(x) median(coredata(x), na.rm = TRUE))
              t <- as.yearmon(seq(as.Date(start(z)), as.Date(end(z)), by = "months"))
              merge(z, zoo(,t), all = c(FALSE, TRUE))
          })

    ## calculate number of (monthly) observations in each calendar year for each site
    task.i <- task.i + 1
    message("Task ", task.i, " out of ", ntasks)
    nperyear <- llply(ans$monthly,
                      .progress = myprogress("Counting number of observations each year"),
                      function(z)
                  {
                      aggregate(z, as.integer, function(z) sum(complete.cases(z)))
                  })
    nperyear <-
        merge_zoolist(nperyear, .progress = myprogress("Merging observations per year to matrix"))
    coredata(nperyear)[coredata(is.na(nperyear))] <- 0
    ans$nperyear <- nperyear

    ## calculate seasonal fluctuations
    task.i <- task.i + 1
    message("Task ", task.i, " out of ", ntasks)
    ans$fluct <-
        unlist(llply(ans$monthly,
                     .progress = myprogress("Calculating seasonal fluctuations"),
                     function(z)
                 {
                     if (sum(!is.na(z)) < 10) return(NA)
                     ## first, remove a linear trend
#                     y <- coredata(z)
#                     x <- as.numeric(index(z))
#                     coredata(z) <- residuals(MASS::rlm(y ~ x, na.action = na.exclude))
                     median(aggregate(z, floor, function(z) {
                         tmp <- diff(range(z))
                         if (identical(tmp, 0)) NA else tmp
                     }), na.rm = TRUE)
                 }))

    ## calculate annual values (median of monthly values).
    ## BUT for years with only a single (monthly) value,
    ## replace with the median over at least 3 values in a 5-year window.
    ## -- to avoid influence of errors in single data points
    task.i <- task.i + 1
    message("Task ", task.i, " out of ", ntasks)
    ans$annual <-
        llply(ans$monthly,
              .progress = myprogress("Aggregating (by medians) to annual values; rolling median"),
              function(z)
          {
              ## 1st step: calculate medians for each year
              ## Note floor() applied to 'yearmon' gives the year (for aggregation)
              zann <-
                  aggregate(z, floor, function(x) {
                      median(coredata(x), na.rm = TRUE)
                  })

              znum <-
                  aggregate(z, floor, function(x) {
                      sum(complete.cases(x))
                  })

              ## 2nd step: rolling median over chunks of 5 data-years
              tmp <- na.omit(zann)
              width.odd <- length(tmp) - (1 - length(tmp)%%2)
              zrollmed <-
                  rollapply(tmp, width = min(5, width.odd),
                            median, na.pad = TRUE)
              zrollmed <- merge(zrollmed, zoo(,time(zann)))

              ## 3rd step: use the rolling medians to fill in
              ## years which have fewer than 3 monthly observations.
              zann[znum < 3] <- zrollmed[znum < 3]
              ## locf = last observation carry forward
              na.locf(na.trim(zann, sides = "left"))
          })
    ans$levels <- merge_zoolist(ans$annual,
                                .progress = myprogress("Merging annual levels to matrix"))
    index(ans$levels) <- as.integer(index(ans$levels))

    ## calculate time series of changes from the "now" year
    ## create a matrix of the same size as $levels with just the "now" data
    levs.nowmatrix <- na.locf(merge(ans$levels[I(now),], zoo(,time(ans$levels))),
                              fromLast = TRUE, na.rm = FALSE)
    ans$deltas <- window(ans$levels - levs.nowmatrix, end = now)

    ans
}

## 03_plot_all_sites.R

plot_all_timeseries <- function(tslist, ..., ident,
                                nperpage = 20, y.same = NA)
{
    label <- toString(deparse(substitute(tslist)), width = 20)
    ## generate pdf with all time series
    pdf(paste(ident, "_timeseries.pdf", sep = ""), paper = "a4r", width = 10)
    l_ply(1:ceiling(length(tslist)/nperpage),
          .progress = myprogress(paste("Plotting", label, "time series to PDF")),
          function(i)
      {
          ii <- (i-1)*nperpage + 1:nperpage
          ii <- ii[ii <= length(tslist)]
          foo <-
              xyplot(tslist[ii], ..., y.same = y.same)
          try(print(foo))
      })
    dev.off()
}

plot_all_timecoverage <- function(tslist, ..., ident, nperpage = 50)
{
    label <- toString(deparse(substitute(tslist)), width = 20)
    ## generate pdf with all time series coverage
    pdf(paste(ident, "_timecoverage.pdf", sep = ""), paper = "a4r", width = 10)
    l_ply(1:ceiling(length(tslist)/nperpage),
          .progress = myprogress(paste("Plotting", label, "time coverage to PDF")),
          function(i)
      {
          ii <- (i-1)*nperpage + 1:nperpage
          ii <- ii[ii <= length(tslist)]
          foo <-
              xyplot(unname(tslist[ii]), layout = c(1,NA),
                     strip = FALSE, strip.left = FALSE,
                     ylab.right = list(names(tslist)[ii], cex = 0.4, rot = 0),
                     panel = panel.xblocks,
                     xlab = NULL,
                     xscale.components = xscale.components.subticks,
                     x.same = TRUE,
                     scales = list(y = list(draw = FALSE),
                     x = list(axs = "i", tick.number = 8)))
          foo <- update(foo, ...)
          try(print(foo))
      })
    dev.off()
}

## 04_plot_distributions.R

plot_distributions <-
    function(tsinfo, ident, now, include.lines = FALSE)
{
    ans <- list()

    ## use all years with at least one observation
    tsok <- (tsinfo$nperyear >= 1)

    ## plot distribution of available annual values over sites
    ts.ann.n <- sapply(tsinfo$annual, function(x) sum(!is.na(coredata(x))))
    ans$npersite <-
    ecdfplot(ts.ann.n, grid = TRUE,
             xscale.components = xscale.components.log10ticks,
             main = "Distribution of number of annual values",
             ylab = "Empirical CDF (cumulative fraction of sites)",
             xlab = "number of years available at each site")

    ## plot number of sites with at least 1 monthly value, each year
    ans$sitesperyear <-
    xyplot(zoo(apply(tsok, 1, sum), index(tsok)),
           panel = panel.barchart, horizontal = FALSE, origin = 0,
           ylab = "number of sites with usable annual estimates", xlab = NULL,
           xscale.components = xscale.components.subticks) +
               layer_(panel.grid(h = -1, v = 0))

    ## how many sites have data in BOTH a 1-YEAR reference period and the most recent 1-YEAR period?
    ## (for several possible reference periods)
    isok.1year.now <- tsok[I(now),]
    isok.1year.each <- tsok
    isok.1year.each[,!isok.1year.now] <- FALSE
    ans$oksitesperyear <-
    barchart(apply(isok.1year.each, 1, sum) ~ format(time(isok.1year.each)),
             ylab = sprintf("number of sites with 1 year data\nin BOTH reference and current (%i)", now),
             origin = 0, scales = list(x = list(rot = 90))) +
                 layer_(panel.grid(h = -1, v = 0))


    ## how many sites have data in BOTH a 2-YEAR reference period and the most recent 2-YEAR period?
    ## (for several possible reference periods)
    isok.2year.now <- aggregate(tsok[I(now-(1:0)),], rep(now-1, 2), all)
    isok.2year.each <- rollapply(tsok, width = 2, all, align = "right")
    isok.2year.each[,!isok.2year.now] <- FALSE
    ans$oksitesper2years <-
    barchart(apply(isok.2year.each, 1, sum) ~ format(time(isok.2year.each)),
             ylab = sprintf("number of sites with 2 years data\nin BOTH reference and current (%s)",
                            paste(now-(1:0), collapse = "-")),
             origin = 0, scales = list(x = list(rot = 90))) +
                 layer_(panel.grid(h = -1, v = 0))

    ## plot distribution of changes over time
    ans$deltas.box <-
        plot_distribution_trend(tsinfo$deltas)

    if (include.lines) {
        ans$deltas.lines <-
            plot_distribution_trend(tsinfo$deltas, with.lines = TRUE)
    }

    ans
}


plot_distribution_trend <-
    function(deltas, ## e.g. vicinfo$deltas
             ...,
             with.lines = FALSE)
{
    ## plot distribution of changes over time
    if (with.lines == FALSE) {
        ydat <- stack(as.data.frame(t(deltas)))
        ## reverse time (to show "years ago")
        yearlabels <- time(deltas)
        ydat$ind <- factor(ydat$ind, levels = rev(time(deltas)), labels = rev(yearlabels))

        ans <-
            bwplot(values ~ ind, ydat, ylim = c(-20,20),
                   pch = "|", varwidth = TRUE, par.settings = simpleTheme(cex = 0.7),
                   ylab = sprintf("change in water level from current (%s)", end(deltas)),
                   xlab = "reference year",
                   scales = list(x = list(rot = 90), y = list(tick.number = 9)))
    } else {
        ## show individual sites as lines
        ans <-
            xyplot(deltas, screens = 1,
                   xlim = rev(range(time(deltas))), alpha = 0.2,
                   ylab = sprintf("change in water level from current (%s)", end(deltas)),
                   xlab = "reference year")
    }

    ans <- ans +
        layer_(panel.grid(-1,-1, x = x, y = y), panel.abline(h=0)) +
        layer(panel.points(1, 0, pch = 16, col = "black"),
              panel.arrows(1, 2.5, 1, 0, length = 0.1),
              panel.text(1, 3, '"current" (comparison point)',
                         srt = 90, pos = 4, offset = 0, cex = 0.7))
    ans
}

## 05_plot_spatial.R

## aggregate points (sites) into groundwater management units.
## NOTE can not use overlay() directly because it does not account for overlapping polygons!
## So loop through and apply overlay() to each "Polygons" element:
overlayEach <- function(spatialpoints, spatialpolys, ..., out.n = 1, fill = NA_real_)
{
    foo <-
        ldply(spatialpolys@polygons,
              .progress = myprogress("Spatial aggregation"),
              function(p, ...) {
                  ans <- unlist(overlay(spatialpoints, SpatialPolygons(list(p)), ...))
                  if (length(ans) == 0) rep(fill, length = out.n) else ans
              }, ...)
    rownames(foo) <- rownames(spatialpolys@data)
    foo
}

## plot a SpatialPolygonsDataFrame and associated SpatialPointsDataFrame
## or a given subset (typically only 1) of the polygons
## (GMU = Groundwater Management Unit)
gmuLevelPlot <-
    function(polydf, pointsdf, subset = NULL, ...,
             colrange = c(-15, 15),
             xlim = extendrange(bbox(subpolydf)[1, ], f = 0.04),
             ylim = extendrange(bbox(subpolydf)[2, ], f = 0.04),
             main = paste("Change in level (m) since", refyear), refyear,
             xlab.top = toString(rownames(polydf@data)[subset]),
             cities = cities.ok,
             par.settings = custom.theme.2())
{
    ## limit range of data, for plotting
    subcolrange <- extendrange(colrange, f = -0.01)
    polydf@data[] <- lapply(polydf@data, function(x)
                            pmin(pmax(x, min(subcolrange)), max(subcolrange)))
    colnames(pointsdf@data)[1] <- "value"
    pointsdf@data$value <-
        pmin(pmax(pointsdf@data$value, min(subcolrange)), max(subcolrange))

    subpolydf <- polydf
    subpointsdf <- pointsdf
    if (!is.null(subset)) {
        subpolydf <- polydf[subset,]
        inside <- !is.na(overlay(pointsdf, subpolydf))
        subpointsdf <- pointsdf[inside,]
    }

    result <- list()

    if (!is.null(subset)) {
        result$context <-
            spplot(polydf[,1], main = main, colorkey = FALSE,
                   col = "grey", col.regions = NA) +
                layer(panel.rect(min(xlim), min(ylim), max(xlim), max(ylim),
                                 col = "yellow", alpha = 0.5),
                      sp.polygons(subpolydf),
                      data = environment())
    }

    result$levels <-
    spplot(subpolydf, ...,
           at = seq(min(colrange), max(colrange), by = 1),
           xlim = xlim, ylim = ylim,
           lty = 0, as.table = TRUE, par.settings = par.settings,
           main = main, xlab.top = xlab.top) +
        layer_(panel.fill("grey90"), panel.grid(100,100, col = "white")) +
        layer(panel.levelplot.points(x = Lng, y = Lat, z = value,
                                     subscripts = TRUE, ..., cex = 0.5,
                                     col.symbol = "transparent"),
              ## show translucent circles for all points (crosses for NAs)
              grid.points(x = Lng, y = Lat,
                          pch = ifelse(is.na(value), 4, 1),
                          gp = gpar(cex = 1, alpha = 0.1, lwd = 0.1)),
              data = as.data.frame(subpointsdf),
              packets = 1) +
        layer(panel.points(long, lat, pch = 15, cex = 0.6, col = "black"),
              panel.text(long, lat, name, pos = 1, cex = 0.6), data = cities) +
        layer(sp.polygons(subpolydf, lwd = 0.5, alpha = 0.5), data = environment())

    result
}

## take one polygon ('subset') from the SpatialPolygonsDataFrame
## (GMU = Groundwater Management Unit)
## with associated points from the given SpatialPointsDataFrame
## and generate smoothed surface over the points using gam().
## also returns a context plot of the 'subset' polygon.
gmuLevelPlotGAM <-
    function(polydf, pointsdf, subset = NULL, ...,
             minimum.sites = 20,
             colrange = c(-15, 15),
             xlim = extendrange(bbox(subpolydf)[1, ], f = 0.04),
             ylim = extendrange(bbox(subpolydf)[2, ], f = 0.04),
             xlab.top = paste("Change in level (m) since", refyear), refyear,
             main = toString(rownames(polydf@data)[subset]),
             cities = cities.ok,
             par.settings = custom.theme.2())
{
    stopifnot("value" %in% colnames(pointsdf@data))

    subpolydf <- polydf
    subpointsdf <- pointsdf
    if (!is.null(subset)) {
        subpolydf <- polydf[subset,]
        inside <- !is.na(overlay(pointsdf, subpolydf))
        subpointsdf <- pointsdf[inside,]
    }
    sitesok <- !is.na(subpointsdf@data$value)
    if (length(sitesok) < minimum.sites) {
        stop("number of sites less than ", minimum.sites)
    }
    if (sum(sitesok) == 0) {
        stop("no active sites (no non-missing values)")
    }

    ## limit range of data, for plotting
    subcolrange <- extendrange(colrange, f = -0.01)
    subpolydf@data[] <- lapply(subpolydf@data, function(x)
                               pmin(pmax(x, min(subcolrange)), max(subcolrange)))
    subpointsdf@data$value <-
        pmin(pmax(subpointsdf@data$value, min(subcolrange)), max(subcolrange))

    result <- list()

    if (!is.null(subset)) {
        result$context <-
            spplot(polydf[,1], main = main, colorkey = FALSE,
                   col = "grey", col.regions = NA) +
                layer(panel.rect(min(xlim), min(ylim), max(xlim), max(ylim),
                                 col = "yellow", alpha = 0.5),
                      sp.polygons(subpolydf),
                      data = environment())
    }

    ## fit a GAM and generate predictions on a regular grid inside the polygon
    samp <- spsample(subpolydf, n = 200 * 200, type = "regular")

    library(mgcv)
    mod <- try(gam(value ~ s(Lng,Lat), data = as.data.frame(subpointsdf)),
               silent = TRUE)
    if (inherits(mod, "try-error")) {
        ## can't fit a GAM, so just fill with a constant
        pred <- rep(median(subpointsdf@data$value, na.rm = TRUE), nrow(as.data.frame(samp)))
    } else {
        pred <- as.vector(predict(mod, with(as.data.frame(samp), list(Lng = x1, Lat = x2))))
    }

    result$levels <-
    spplot(subpolydf, ...,
           at = seq(min(colrange), max(colrange), by = 1),
           xlim = xlim, ylim = ylim,
           par.settings = par.settings,
           panel = function(...) NULL,
           main = main, xlab.top = xlab.top) +
        layer_(panel.fill("grey90"), panel.grid(100, 100, col = "white")) +
        layer(panel.levelplot(x = x1, y = x2, z = pred, subscripts = TRUE, ...),
              data = c(list(pred = pred), as.data.frame(samp))) +
        layer(panel.levelplot.points(x = Lng, y = Lat, z = value,
                                     subscripts = TRUE, ..., cex = 1,
                                     col.symbol = "transparent"),
              ## show translucent circles for all points (crosses for NAs)
              grid.points(x = Lng, y = Lat,
                          pch = ifelse(is.na(value), 4, 1),
                          gp = gpar(cex = 1, alpha = 0.3, lwd = 0.2)),
              data = as.data.frame(subpointsdf),
              packets = 1) +
        layer(sp.polygons(subpolydf, lwd = 0.5, alpha = 0.5), data = environment())

    result
}

## horizonplot of time series (levelzoo)
## using all sites within the given polydf[subset,]
gmuHorizonplot <-
    function(polydf, pointsdf, subset = NULL, ...,
             levelzoo, ## e.g. vicinfo$levels
             horizonscale = 5,
             origin = function(y) mean(range(y, na.rm = TRUE)))
{
    subpolydf <- polydf
    subpointsdf <- pointsdf
    if (!is.null(subset)) {
        subpolydf <- polydf[subset,]
        inside <- !is.na(overlay(pointsdf, subpolydf))
        subpointsdf <- pointsdf[inside,]
    }
    if (nrow(subpointsdf) == 0)
        stop("no values")

    stopifnot(c("stationID", "longName") %in% colnames(pointsdf@data))
    siteids <- as.character(subpointsdf@data$stationID)
    sitenames <- as.character(subpointsdf@data$longName)
    sitenames <- substr(sitenames, 1, 18)
    ord <- order(siteids)
    siteids <- siteids[ord]
    sitenames <- sitenames[ord]
    result <-
        horizonplot(0 - levelzoo[,siteids], strip.left = FALSE,
                    horizonscale = horizonscale,
                    origin = origin,
                    colorkey = list(space = "left"), layout = c(1, NA),
                    scales = list(x = list(axs = "i")), xlab = NULL,
                    par.settings = list(axis.line = list(col = "gray50")),
                    ylab = list(rev(siteids), rot = 0, cex = 0.3),
                    ylab.right = list(sitenames, rot = 0, cex = 0.3))

    update(result, ...)
}

## boxplots of distributions of changes over time (deltas)
## using all sites within the given polydf[subset,]
gmuDistributionTrend <-
        function(polydf, pointsdf, subset = NULL, ...,
                 deltas) ## e.g. vicinfo$deltas
{
    subpolydf <- polydf
    subpointsdf <- pointsdf
    if (!is.null(subset)) {
        subpolydf <- polydf[subset,]
        inside <- !is.na(overlay(pointsdf, subpolydf))
        subpointsdf <- pointsdf[inside,]
    }
    if (nrow(subpointsdf) == 0)
        stop("no values")

    stopifnot(c("stationID", "longName") %in% colnames(pointsdf@data))
    siteids <- as.character(subpointsdf@data$stationID)
    sitenames <- as.character(subpointsdf@data$longName)
    sitenames <- substr(sitenames, 1, 18)
    ord <- order(siteids)
    siteids <- siteids[ord]
    sitenames <- sitenames[ord]
    plot_distribution_trend(deltas[,siteids], ...)
}

## 0_nsw.R
## process the data for NSW

## TODO: reading in the whole table may not be feasible on some machines.
## It would probably be better to read in data for each site using SQL?
nsw <- sqlFetch(conn, "NSW")

## convert to zoo and remove obvious bad values
nswlist <-
    llply(split(1:nrow(nsw), nsw$stationID),
          .progress = myprogress("Splitting by site and converting to zoo objects"),
          function(ii)
      {
          x <- nsw[ii,]
          ## remove zero values (from visual inspection this is necessary):
          x <- subset(x, StandingWaterLevel != 0.0)
          ## TODO: negative values?
          z <- suppressWarnings(zoo(x$StandingWaterLevel, x$observTime))
      })

rm(nsw)

## remove series without any observations
nswlist <- nswlist[sapply(nswlist, function(x) sum(!is.na(coredata(x)))) > 0]

## remove all discontinued sites (those with no data after 2006)
nswlist <- nswlist[sapply(nswlist, end) >= as.POSIXct("2006-01-01")]

## distribution of start times
as.yearmon(quantile(do.call("c", lapply(nswlist, start))))
## distribution of end times
as.yearmon(quantile(do.call("c", lapply(nswlist, end))))

nswinfo <- gw_timeseries_process(nswlist, now = 2007)
save(nswinfo, file = "nswinfo.Rdata")

#load("nswinfo.Rdata")

## summary of changes each year (from current)
#summary(t(nswinfo$deltas))

## plot up some samples of the monthly + annual series, interactively
devAskNewPage(TRUE)
#pdf("nsw_samples.pdf", paper = "a4r", width = 18, height = 10)
for (i in head(seq(0, length(nswinfo$monthly), length = 10), -1)) {
    print(update(xyplot(nswinfo$monthly[1:20 + i], type = "b", cex = 0.7),
                 scales = list(y = "sliced", x = "same")) +
          xyplot(nswinfo$annual[1:20 + i], type = "s", col = "red", lwd = 2))
}
#dev.off()
devAskNewPage(FALSE)

plot_all_timeseries(nswinfo$monthly, ident = "nsw_monthly",
                    x.same = TRUE)

plot_all_timeseries(nswinfo$annual, ident = "nsw_annual", type = "s",
                    xlim = extendrange(c(1970,2010)))

## slow!
plot_all_timecoverage(nswinfo$monthly, ident = "nsw",
                      xlim = c(1970,2010))

## the real stuff:
do_groundwater_analysis(nswinfo, state = "NSW")

	## groundwater database analysis scripts
	## Felix Andrews <felix@nfrac.org> 2010-08-01

	library(RODBC)
	library(zoo)
	library(lattice)
	library(latticeExtra)
	library(sp)
	library(rgdal)
	library(mgcv)
	library(plyr) ## for progress bars
	library(maps) ## for world.cities

	stopifnot(packageDescription("zoo")$Version >= "1.7")
	stopifnot(packageDescription("lattice")$Version >= "0.19")
	stopifnot(packageDescription("latticeExtra")$Version >= "0.6-14")

	lattice.options(default.theme = custom.theme.2())

	## use Windows progress bar. otherwise change this to progress_text or progress_tk
	myprogress <- function(title = "Progress") progress_win(title)

	setwd("X:/Projects/BOMGW")
	conn <- odbcConnectAccess("MdbCsv/water.mdb")
	sqlTables(conn)
	locs <- sqlFetch(conn, "Locations")
	locs$DrilledDpth <- as.numeric(gsub(",", "", locs$DrilledDpth))

	gmus <- readOGR("agmu_r9nnd_00311a01e_sgeo___", "gmushape")
	rownames(gmus@data) <- make.unique(as.character(gmus@data$GMU_NAME))

	#rivers <- readOGR("river_reaches/arc_reach_shape", "arc_reach")
	#rivers <- rivers[,c("STATE", "BASIN", "A", "M")]
	#save(rivers, file = "rivers.Rdata")
	load("rivers.Rdata")

	data(world.cities)
	cities.all <- subset(world.cities, country.etc == "Australia")
	cities.ok <- subset(world.cities, country.etc == "Australia" & (pop >= 50000) \|
	(name %in% c("Darwin", "Alice Springs", "Kalgoolie", "Broome", "Walgett",
	"Mount Isa", "Esperance", "Broken Hill", "Coober Pedy",
	"Bendigo", "Ballarat", "Sale", "Mildura"))) # "Echuca-Moama"

	source("01_script.R")
	source("02_timeseries.R")
	source("03_plot_all_sites.R")
	source("04_plot_distributions.R")
	source("05_plot_spatial.R")

	## now you are ready to run "0_nsw.R" etc

	## groundwater database analysis scripts
	## Felix Andrews <felix@nfrac.org> 2010-08-01

	do_groundwater_analysis <-
	function(infolist, state = "VIC", ident = state,
	refyears = c(1985, 1990, 1995), now = 2007,
	gmus.state = gmus[gmus$STATE == state,],
	rivers.state = rivers[rivers$STATE == state,],
	do.what = c("state", "gmus"))
	{
	if (!file.exists(ident)) dir.create(ident)

	message("generating distribution plots")
	png(sprintf("%s/%s_distributions_%%01d.png", ident, ident),
	width = 3000, height = 1500, res = 300)
	print(plot_distributions(infolist, ident = ident, now = now))
	dev.off()

	## write out differences from current as shapefile (points)
	message("writing points to shapefile")
	deltas <- as.data.frame(t(infolist$deltas))
	deltas$stationID <- rownames(deltas)
	pt.deltas <- merge(locs, deltas)
	pt.deltas <- subset(pt.deltas, Lng > 120)
	## need to keep column names to at most 10 characters for writeOGR
	colnames(pt.deltas) <- substring(colnames(pt.deltas), first = 1, last = 10)
	pt.deltas <- SpatialPointsDataFrame(coords = pt.deltas[,c("Lng","Lat")],
	data = pt.deltas)
	writeOGR(pt.deltas,
	sprintf("%s/%s_gw_trends", ident, ident),
	"sites_deltas",
	driver = "ESRI Shapefile")

	if ("state" %in% do.what) {
	for (refyear in refyears) {
	message("aggregating to GMUs for reference year ", refyear)
	## calculate quantiles of changes over the set of bores in each GMU
	qqtmp <-
	overlayEach(pt.deltas[,toString(refyear)], gmus.state,
	fn = function(x) {
	qq <- quantile(x[,1], c(0.5, 0.05, 0.01), na.rm = TRUE)
	names(qq) <- c("median", "worst_5pc", "worst_1pc") ## max 10 chars!
	qq[["n_values"]] <- sum(!is.na(x[,1]))
	qq
	},
	out.n = 4)
	ii.na <- is.na(qqtmp[,"n_values"])
	qqtmp[ii.na, "n_values"] <- 0

	## write out aggregated stats as shapefile (polygons)
	gmus.ref <- gmus.state
	gmus.ref@data <- cbind(qqtmp, gmus.ref@data)

	writeOGR(gmus.ref,
	sprintf("%s/%s_gw_trends", ident, ident),
	sprintf("gmus_%i", refyear),
	driver = "ESRI Shapefile")

	pt.deltas.ref <- pt.deltas[,c(refyear, "stationID", "longName")]
	colnames(pt.deltas.ref@data)[1] <- "value"

	## generate maps (quantiles of changes in GMUs)
	foo <-
	gmuLevelPlot(gmus.ref[,1:4], pt.deltas.ref, refyear = refyear,
	xlab.top = ident)

	##pdf(sprintf("%s/%s_deltas%i_gmus.pdf", ident, ident, refyear),
	## paper = "a4r", width = 18, height = 10)
	##print(foo)
	##dev.off()

	png(sprintf("%s/%s_deltas%i_gmus.png", ident, ident, refyear),
	width = 6000, height = 3000, res = 300)
	print(foo)
	dev.off()
	}
	}

	if ("gmus" %in% do.what) {
	## generate detailed results for each GMU
	l_ply(seq(1, nrow(gmus.state)),
	.progress = myprogress("generating detailed results for each GMU"),
	function (i)
	{
	gmuname <- as.character(gmus.state@data[i, "GMU_NAME"])
	gmuno <- as.character(gmus.state@data[i, "GMU_NO"])
	message(gmuno, gmuname)
	gmunameok <- make.names(substr(gmuname, 1, 15))
	nameok <- paste(sprintf("%s/%s", ident, ident), gmuno, gmunameok, sep = "_")
	##pdf(paste(nameok, ".pdf", sep = ""), paper = "a4r", width = 18, height = 10)
	png(paste(nameok, "%01d.png", sep = ""), width = 3000, height = 1500, res = 300)

	anyOK <- FALSE
	for (refyear in refyears) {

	pt.deltas.ref <- pt.deltas[,c(toString(refyear), "stationID", "longName")]
	colnames(pt.deltas.ref@data)[1] <- "value"

	foo <-
	tryCatch(gmuLevelPlotGAM(gmus.state[,1], pt.deltas.ref, subset = i,
	refyear = refyear, cities = cities.all),
	error = force)
	if (inherits(foo, "error")) {
	message(foo$message)
	next
	} else {
	if (anyOK == FALSE) ## only once:
	print(foo$context)
	anyOK <- TRUE
	}

	print(foo$levels +
	layer(sp.lines(rivers.state, lwd = 2, col = "blue", alpha = 0.5),
	data = environment())
	)
	}

	if (anyOK) {
	foo <-
	tryCatch(gmuHorizonplot(gmus.state[,1], pt.deltas, subset = i,
	levelzoo = infolist$levels),
	error = force)
	if (inherits(foo, "error")) {
	message(foo$message)
	} else {
	print(foo)

	print(gmuDistributionTrend(gmus.state[,1], pt.deltas, subset = i,
	deltas = infolist$deltas))
	}
	}

	dev.off()
	})
	}
	}

	## merge a list of zoo objects (and put all on the same time scale).
	## return the result as a merged zoo matrix, indexed by seq(start, end, by = by)
	## This is similar to do.call("merge", tslist) but that can fail due to memory usage.
	merge_zoolist <- function(tslist, ..., by = 1L)
	{
	start <- do.call(min, lapply(tslist, start))
	end <- do.call(max, lapply(tslist, end))
	newindex <- seq(start, end, by = by)
	## put each series on the same index and extract core data
	coreagg <- llply(tslist, function(z) {
	if (length(z) == 0)
	z <- zoo(1) ## need this, otherwise merged series has 0 columns
	coredata(merge(z, zoo(, newindex), all = c(FALSE, TRUE)))
	}, ...)
	ans <- matrix(unlist(coreagg, use.names = FALSE),
	ncol = length(tslist), nrow = length(newindex))
	colnames(ans) <- names(tslist)
	zoo(ans, newindex)
	}


	gw_timeseries_process <- function(tslist, now)
	{
	ans <- list()

	message("Processing time series...")
	ntasks <- 4
	task.i <- 0

	## aggregate to regular monthly (median) series
	task.i <- task.i + 1
	message("Task ", task.i, " out of ", ntasks)
	ans$monthly <-
	llply(tslist,
	.progress = myprogress("Aggregating (by medians) to regular monthly series"),
	function(z)
	{
	z <- aggregate(z, as.yearmon,
	function(x) median(coredata(x), na.rm = TRUE))
	t <- as.yearmon(seq(as.Date(start(z)), as.Date(end(z)), by = "months"))
	merge(z, zoo(,t), all = c(FALSE, TRUE))
	})

	## calculate number of (monthly) observations in each calendar year for each site
	task.i <- task.i + 1
	message("Task ", task.i, " out of ", ntasks)
	nperyear <- llply(ans$monthly,
	.progress = myprogress("Counting number of observations each year"),
	function(z)
	{
	aggregate(z, as.integer, function(z) sum(complete.cases(z)))
	})
	nperyear <-
	merge_zoolist(nperyear, .progress = myprogress("Merging observations per year to matrix"))
	coredata(nperyear)[coredata(is.na(nperyear))] <- 0
	ans$nperyear <- nperyear

	## calculate seasonal fluctuations
	task.i <- task.i + 1
	message("Task ", task.i, " out of ", ntasks)
	ans$fluct <-
	unlist(llply(ans$monthly,
	.progress = myprogress("Calculating seasonal fluctuations"),
	function(z)
	{
	if (sum(!is.na(z)) < 10) return(NA)
	## first, remove a linear trend
	# y <- coredata(z)
	# x <- as.numeric(index(z))
	# coredata(z) <- residuals(MASS::rlm(y ~ x, na.action = na.exclude))
	median(aggregate(z, floor, function(z) {
	tmp <- diff(range(z))
	if (identical(tmp, 0)) NA else tmp
	}), na.rm = TRUE)
	}))

	## calculate annual values (median of monthly values).
	## BUT for years with only a single (monthly) value,
	## replace with the median over at least 3 values in a 5-year window.
	## -- to avoid influence of errors in single data points
	task.i <- task.i + 1
	message("Task ", task.i, " out of ", ntasks)
	ans$annual <-
	llply(ans$monthly,
	.progress = myprogress("Aggregating (by medians) to annual values; rolling median"),
	function(z)
	{
	## 1st step: calculate medians for each year
	## Note floor() applied to 'yearmon' gives the year (for aggregation)
	zann <-
	aggregate(z, floor, function(x) {
	median(coredata(x), na.rm = TRUE)
	})

	znum <-
	aggregate(z, floor, function(x) {
	sum(complete.cases(x))
	})

	## 2nd step: rolling median over chunks of 5 data-years
	tmp <- na.omit(zann)
	width.odd <- length(tmp) - (1 - length(tmp)%%2)
	zrollmed <-
	rollapply(tmp, width = min(5, width.odd),
	median, na.pad = TRUE)
	zrollmed <- merge(zrollmed, zoo(,time(zann)))

	## 3rd step: use the rolling medians to fill in
	## years which have fewer than 3 monthly observations.
	zann[znum < 3] <- zrollmed[znum < 3]
	## locf = last observation carry forward
	na.locf(na.trim(zann, sides = "left"))
	})
	ans$levels <- merge_zoolist(ans$annual,
	.progress = myprogress("Merging annual levels to matrix"))
	index(ans$levels) <- as.integer(index(ans$levels))

	## calculate time series of changes from the "now" year
	## create a matrix of the same size as $levels with just the "now" data
	levs.nowmatrix <- na.locf(merge(ans$levels[I(now),], zoo(,time(ans$levels))),
	fromLast = TRUE, na.rm = FALSE)
	ans$deltas <- window(ans$levels - levs.nowmatrix, end = now)

	ans
	}

	plot_all_timeseries <- function(tslist, ..., ident,
	nperpage = 20, y.same = NA)
	{
	label <- toString(deparse(substitute(tslist)), width = 20)
	## generate pdf with all time series
	pdf(paste(ident, "_timeseries.pdf", sep = ""), paper = "a4r", width = 10)
	l_ply(1:ceiling(length(tslist)/nperpage),
	.progress = myprogress(paste("Plotting", label, "time series to PDF")),
	function(i)
	{
	ii <- (i-1)*nperpage + 1:nperpage
	ii <- ii[ii <= length(tslist)]
	foo <-
	xyplot(tslist[ii], ..., y.same = y.same)
	try(print(foo))
	})
	dev.off()
	}

	plot_all_timecoverage <- function(tslist, ..., ident, nperpage = 50)
	{
	label <- toString(deparse(substitute(tslist)), width = 20)
	## generate pdf with all time series coverage
	pdf(paste(ident, "_timecoverage.pdf", sep = ""), paper = "a4r", width = 10)
	l_ply(1:ceiling(length(tslist)/nperpage),
	.progress = myprogress(paste("Plotting", label, "time coverage to PDF")),
	function(i)
	{
	ii <- (i-1)*nperpage + 1:nperpage
	ii <- ii[ii <= length(tslist)]
	foo <-
	xyplot(unname(tslist[ii]), layout = c(1,NA),
	strip = FALSE, strip.left = FALSE,
	ylab.right = list(names(tslist)[ii], cex = 0.4, rot = 0),
	panel = panel.xblocks,
	xlab = NULL,
	xscale.components = xscale.components.subticks,
	x.same = TRUE,
	scales = list(y = list(draw = FALSE),
	x = list(axs = "i", tick.number = 8)))
	foo <- update(foo, ...)
	try(print(foo))
	})
	dev.off()
	}

	plot_distributions <-
	function(tsinfo, ident, now, include.lines = FALSE)
	{
	ans <- list()

	## use all years with at least one observation
	tsok <- (tsinfo$nperyear >= 1)

	## plot distribution of available annual values over sites
	ts.ann.n <- sapply(tsinfo$annual, function(x) sum(!is.na(coredata(x))))
	ans$npersite <-
	ecdfplot(ts.ann.n, grid = TRUE,
	xscale.components = xscale.components.log10ticks,
	main = "Distribution of number of annual values",
	ylab = "Empirical CDF (cumulative fraction of sites)",
	xlab = "number of years available at each site")

	## plot number of sites with at least 1 monthly value, each year
	ans$sitesperyear <-
	xyplot(zoo(apply(tsok, 1, sum), index(tsok)),
	panel = panel.barchart, horizontal = FALSE, origin = 0,
	ylab = "number of sites with usable annual estimates", xlab = NULL,
	xscale.components = xscale.components.subticks) +
	layer_(panel.grid(h = -1, v = 0))

	## how many sites have data in BOTH a 1-YEAR reference period and the most recent 1-YEAR period?
	## (for several possible reference periods)
	isok.1year.now <- tsok[I(now),]
	isok.1year.each <- tsok
	isok.1year.each[,!isok.1year.now] <- FALSE
	ans$oksitesperyear <-
	barchart(apply(isok.1year.each, 1, sum) ~ format(time(isok.1year.each)),
	ylab = sprintf("number of sites with 1 year data\nin BOTH reference and current (%i)", now),
	origin = 0, scales = list(x = list(rot = 90))) +
	layer_(panel.grid(h = -1, v = 0))


	## how many sites have data in BOTH a 2-YEAR reference period and the most recent 2-YEAR period?
	## (for several possible reference periods)
	isok.2year.now <- aggregate(tsok[I(now-(1:0)),], rep(now-1, 2), all)
	isok.2year.each <- rollapply(tsok, width = 2, all, align = "right")
	isok.2year.each[,!isok.2year.now] <- FALSE
	ans$oksitesper2years <-
	barchart(apply(isok.2year.each, 1, sum) ~ format(time(isok.2year.each)),
	ylab = sprintf("number of sites with 2 years data\nin BOTH reference and current (%s)",
	paste(now-(1:0), collapse = "-")),
	origin = 0, scales = list(x = list(rot = 90))) +
	layer_(panel.grid(h = -1, v = 0))

	## plot distribution of changes over time
	ans$deltas.box <-
	plot_distribution_trend(tsinfo$deltas)

	if (include.lines) {
	ans$deltas.lines <-
	plot_distribution_trend(tsinfo$deltas, with.lines = TRUE)
	}

	ans
	}


	plot_distribution_trend <-
	function(deltas, ## e.g. vicinfo$deltas
	...,
	with.lines = FALSE)
	{
	## plot distribution of changes over time
	if (with.lines == FALSE) {
	ydat <- stack(as.data.frame(t(deltas)))
	## reverse time (to show "years ago")
	yearlabels <- time(deltas)
	ydat$ind <- factor(ydat$ind, levels = rev(time(deltas)), labels = rev(yearlabels))

	ans <-
	bwplot(values ~ ind, ydat, ylim = c(-20,20),
	pch = "\|", varwidth = TRUE, par.settings = simpleTheme(cex = 0.7),
	ylab = sprintf("change in water level from current (%s)", end(deltas)),
	xlab = "reference year",
	scales = list(x = list(rot = 90), y = list(tick.number = 9)))
	} else {
	## show individual sites as lines
	ans <-
	xyplot(deltas, screens = 1,
	xlim = rev(range(time(deltas))), alpha = 0.2,
	ylab = sprintf("change in water level from current (%s)", end(deltas)),
	xlab = "reference year")
	}

	ans <- ans +
	layer_(panel.grid(-1,-1, x = x, y = y), panel.abline(h=0)) +
	layer(panel.points(1, 0, pch = 16, col = "black"),
	panel.arrows(1, 2.5, 1, 0, length = 0.1),
	panel.text(1, 3, '"current" (comparison point)',
	srt = 90, pos = 4, offset = 0, cex = 0.7))
	ans
	}

	## aggregate points (sites) into groundwater management units.
	## NOTE can not use overlay() directly because it does not account for overlapping polygons!
	## So loop through and apply overlay() to each "Polygons" element:
	overlayEach <- function(spatialpoints, spatialpolys, ..., out.n = 1, fill = NA_real_)
	{
	foo <-
	ldply(spatialpolys@polygons,
	.progress = myprogress("Spatial aggregation"),
	function(p, ...) {
	ans <- unlist(overlay(spatialpoints, SpatialPolygons(list(p)), ...))
	if (length(ans) == 0) rep(fill, length = out.n) else ans
	}, ...)
	rownames(foo) <- rownames(spatialpolys@data)
	foo
	}

	## plot a SpatialPolygonsDataFrame and associated SpatialPointsDataFrame
	## or a given subset (typically only 1) of the polygons
	## (GMU = Groundwater Management Unit)
	gmuLevelPlot <-
	function(polydf, pointsdf, subset = NULL, ...,
	colrange = c(-15, 15),
	xlim = extendrange(bbox(subpolydf)[1, ], f = 0.04),
	ylim = extendrange(bbox(subpolydf)[2, ], f = 0.04),
	main = paste("Change in level (m) since", refyear), refyear,
	xlab.top = toString(rownames(polydf@data)[subset]),
	cities = cities.ok,
	par.settings = custom.theme.2())
	{
	## limit range of data, for plotting
	subcolrange <- extendrange(colrange, f = -0.01)
	polydf@data[] <- lapply(polydf@data, function(x)
	pmin(pmax(x, min(subcolrange)), max(subcolrange)))
	colnames(pointsdf@data)[1] <- "value"
	pointsdf@data$value <-
	pmin(pmax(pointsdf@data$value, min(subcolrange)), max(subcolrange))

	subpolydf <- polydf
	subpointsdf <- pointsdf
	if (!is.null(subset)) {
	subpolydf <- polydf[subset,]
	inside <- !is.na(overlay(pointsdf, subpolydf))
	subpointsdf <- pointsdf[inside,]
	}

	result <- list()

	if (!is.null(subset)) {
	result$context <-
	spplot(polydf[,1], main = main, colorkey = FALSE,
	col = "grey", col.regions = NA) +
	layer(panel.rect(min(xlim), min(ylim), max(xlim), max(ylim),
	col = "yellow", alpha = 0.5),
	sp.polygons(subpolydf),
	data = environment())
	}

	result$levels <-
	spplot(subpolydf, ...,
	at = seq(min(colrange), max(colrange), by = 1),
	xlim = xlim, ylim = ylim,
	lty = 0, as.table = TRUE, par.settings = par.settings,
	main = main, xlab.top = xlab.top) +
	layer_(panel.fill("grey90"), panel.grid(100,100, col = "white")) +
	layer(panel.levelplot.points(x = Lng, y = Lat, z = value,
	subscripts = TRUE, ..., cex = 0.5,
	col.symbol = "transparent"),
	## show translucent circles for all points (crosses for NAs)
	grid.points(x = Lng, y = Lat,
	pch = ifelse(is.na(value), 4, 1),
	gp = gpar(cex = 1, alpha = 0.1, lwd = 0.1)),
	data = as.data.frame(subpointsdf),
	packets = 1) +
	layer(panel.points(long, lat, pch = 15, cex = 0.6, col = "black"),
	panel.text(long, lat, name, pos = 1, cex = 0.6), data = cities) +
	layer(sp.polygons(subpolydf, lwd = 0.5, alpha = 0.5), data = environment())

	result
	}

	## take one polygon ('subset') from the SpatialPolygonsDataFrame
	## (GMU = Groundwater Management Unit)
	## with associated points from the given SpatialPointsDataFrame
	## and generate smoothed surface over the points using gam().
	## also returns a context plot of the 'subset' polygon.
	gmuLevelPlotGAM <-
	function(polydf, pointsdf, subset = NULL, ...,
	minimum.sites = 20,
	colrange = c(-15, 15),
	xlim = extendrange(bbox(subpolydf)[1, ], f = 0.04),
	ylim = extendrange(bbox(subpolydf)[2, ], f = 0.04),
	xlab.top = paste("Change in level (m) since", refyear), refyear,
	main = toString(rownames(polydf@data)[subset]),
	cities = cities.ok,
	par.settings = custom.theme.2())
	{
	stopifnot("value" %in% colnames(pointsdf@data))

	subpolydf <- polydf
	subpointsdf <- pointsdf
	if (!is.null(subset)) {
	subpolydf <- polydf[subset,]
	inside <- !is.na(overlay(pointsdf, subpolydf))
	subpointsdf <- pointsdf[inside,]
	}
	sitesok <- !is.na(subpointsdf@data$value)
	if (length(sitesok) < minimum.sites) {
	stop("number of sites less than ", minimum.sites)
	}
	if (sum(sitesok) == 0) {
	stop("no active sites (no non-missing values)")
	}

	## limit range of data, for plotting
	subcolrange <- extendrange(colrange, f = -0.01)
	subpolydf@data[] <- lapply(subpolydf@data, function(x)
	pmin(pmax(x, min(subcolrange)), max(subcolrange)))
	subpointsdf@data$value <-
	pmin(pmax(subpointsdf@data$value, min(subcolrange)), max(subcolrange))

	result <- list()

	if (!is.null(subset)) {
	result$context <-
	spplot(polydf[,1], main = main, colorkey = FALSE,
	col = "grey", col.regions = NA) +
	layer(panel.rect(min(xlim), min(ylim), max(xlim), max(ylim),
	col = "yellow", alpha = 0.5),
	sp.polygons(subpolydf),
	data = environment())
	}

	## fit a GAM and generate predictions on a regular grid inside the polygon
	samp <- spsample(subpolydf, n = 200 * 200, type = "regular")

	library(mgcv)
	mod <- try(gam(value ~ s(Lng,Lat), data = as.data.frame(subpointsdf)),
	silent = TRUE)
	if (inherits(mod, "try-error")) {
	## can't fit a GAM, so just fill with a constant
	pred <- rep(median(subpointsdf@data$value, na.rm = TRUE), nrow(as.data.frame(samp)))
	} else {
	pred <- as.vector(predict(mod, with(as.data.frame(samp), list(Lng = x1, Lat = x2))))
	}

	result$levels <-
	spplot(subpolydf, ...,
	at = seq(min(colrange), max(colrange), by = 1),
	xlim = xlim, ylim = ylim,
	par.settings = par.settings,
	panel = function(...) NULL,
	main = main, xlab.top = xlab.top) +
	layer_(panel.fill("grey90"), panel.grid(100, 100, col = "white")) +
	layer(panel.levelplot(x = x1, y = x2, z = pred, subscripts = TRUE, ...),
	data = c(list(pred = pred), as.data.frame(samp))) +
	layer(panel.levelplot.points(x = Lng, y = Lat, z = value,
	subscripts = TRUE, ..., cex = 1,
	col.symbol = "transparent"),
	## show translucent circles for all points (crosses for NAs)
	grid.points(x = Lng, y = Lat,
	pch = ifelse(is.na(value), 4, 1),
	gp = gpar(cex = 1, alpha = 0.3, lwd = 0.2)),
	data = as.data.frame(subpointsdf),
	packets = 1) +
	layer(sp.polygons(subpolydf, lwd = 0.5, alpha = 0.5), data = environment())

	result
	}

	## horizonplot of time series (levelzoo)
	## using all sites within the given polydf[subset,]
	gmuHorizonplot <-
	function(polydf, pointsdf, subset = NULL, ...,
	levelzoo, ## e.g. vicinfo$levels
	horizonscale = 5,
	origin = function(y) mean(range(y, na.rm = TRUE)))
	{
	subpolydf <- polydf
	subpointsdf <- pointsdf
	if (!is.null(subset)) {
	subpolydf <- polydf[subset,]
	inside <- !is.na(overlay(pointsdf, subpolydf))
	subpointsdf <- pointsdf[inside,]
	}
	if (nrow(subpointsdf) == 0)
	stop("no values")

	stopifnot(c("stationID", "longName") %in% colnames(pointsdf@data))
	siteids <- as.character(subpointsdf@data$stationID)
	sitenames <- as.character(subpointsdf@data$longName)
	sitenames <- substr(sitenames, 1, 18)
	ord <- order(siteids)
	siteids <- siteids[ord]
	sitenames <- sitenames[ord]
	result <-
	horizonplot(0 - levelzoo[,siteids], strip.left = FALSE,
	horizonscale = horizonscale,
	origin = origin,
	colorkey = list(space = "left"), layout = c(1, NA),
	scales = list(x = list(axs = "i")), xlab = NULL,
	par.settings = list(axis.line = list(col = "gray50")),
	ylab = list(rev(siteids), rot = 0, cex = 0.3),
	ylab.right = list(sitenames, rot = 0, cex = 0.3))

	update(result, ...)
	}

	## boxplots of distributions of changes over time (deltas)
	## using all sites within the given polydf[subset,]
	gmuDistributionTrend <-
	function(polydf, pointsdf, subset = NULL, ...,
	deltas) ## e.g. vicinfo$deltas
	{
	subpolydf <- polydf
	subpointsdf <- pointsdf
	if (!is.null(subset)) {
	subpolydf <- polydf[subset,]
	inside <- !is.na(overlay(pointsdf, subpolydf))
	subpointsdf <- pointsdf[inside,]
	}
	if (nrow(subpointsdf) == 0)
	stop("no values")

	stopifnot(c("stationID", "longName") %in% colnames(pointsdf@data))
	siteids <- as.character(subpointsdf@data$stationID)
	sitenames <- as.character(subpointsdf@data$longName)
	sitenames <- substr(sitenames, 1, 18)
	ord <- order(siteids)
	siteids <- siteids[ord]
	sitenames <- sitenames[ord]
	plot_distribution_trend(deltas[,siteids], ...)
	}
	## process the data for NSW

	## TODO: reading in the whole table may not be feasible on some machines.
	## It would probably be better to read in data for each site using SQL?
	nsw <- sqlFetch(conn, "NSW")

	## convert to zoo and remove obvious bad values
	nswlist <-
	llply(split(1:nrow(nsw), nsw$stationID),
	.progress = myprogress("Splitting by site and converting to zoo objects"),
	function(ii)
	{
	x <- nsw[ii,]
	## remove zero values (from visual inspection this is necessary):
	x <- subset(x, StandingWaterLevel != 0.0)
	## TODO: negative values?
	z <- suppressWarnings(zoo(x$StandingWaterLevel, x$observTime))
	})

	rm(nsw)

	## remove series without any observations
	nswlist <- nswlist[sapply(nswlist, function(x) sum(!is.na(coredata(x)))) > 0]

	## remove all discontinued sites (those with no data after 2006)
	nswlist <- nswlist[sapply(nswlist, end) >= as.POSIXct("2006-01-01")]

	## distribution of start times
	as.yearmon(quantile(do.call("c", lapply(nswlist, start))))
	## distribution of end times
	as.yearmon(quantile(do.call("c", lapply(nswlist, end))))

	nswinfo <- gw_timeseries_process(nswlist, now = 2007)
	save(nswinfo, file = "nswinfo.Rdata")

	#load("nswinfo.Rdata")

	## summary of changes each year (from current)
	#summary(t(nswinfo$deltas))

	## plot up some samples of the monthly + annual series, interactively
	devAskNewPage(TRUE)
	#pdf("nsw_samples.pdf", paper = "a4r", width = 18, height = 10)
	for (i in head(seq(0, length(nswinfo$monthly), length = 10), -1)) {
	print(update(xyplot(nswinfo$monthly[1:20 + i], type = "b", cex = 0.7),
	scales = list(y = "sliced", x = "same")) +
	xyplot(nswinfo$annual[1:20 + i], type = "s", col = "red", lwd = 2))
	}
	#dev.off()
	devAskNewPage(FALSE)

	plot_all_timeseries(nswinfo$monthly, ident = "nsw_monthly",
	x.same = TRUE)

	plot_all_timeseries(nswinfo$annual, ident = "nsw_annual", type = "s",
	xlim = extendrange(c(1970,2010)))

	## slow!
	plot_all_timecoverage(nswinfo$monthly, ident = "nsw",
	xlim = c(1970,2010))

	## the real stuff:
	do_groundwater_analysis(nswinfo, state = "NSW")