Skip to content

Instantly share code, notes, and snippets.

@brownag
Last active July 13, 2020 01:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brownag/946af2a5a56bb4c1dd4007e3024dd4c6 to your computer and use it in GitHub Desktop.
Save brownag/946af2a5a56bb4c1dd4007e3024dd4c6 to your computer and use it in GitHub Desktop.
Use aqp to handle missing data and typical pitfalls with field profile descriptions for IO from new R package mpspline2 by @obrl_soil
# making use of @obrl-soil's new mpspline2 package [soon to be on CRAN!]
#
# author: andrew g brown; 2020/07/12
#
# this is an attempt at creating a generic method for mass-preserving / equal-area splines
# on aqp SoilProfileCollection (SPC) data that may contain a large amount of missing values.
#
# it also provides an avenue to get 1cm continuous spline output back in a handy form (1cm sliced SPC).
#
# In the presence of missing values, getting the result back into the SPC is not always easy.
#
# The example code that follows should illustrate the major indexing pitfalls you need to
# be aware of.
#
# I have tried to cover corner cases so please let me know if you find a way to break it!
#
# get development versions of soil packages + dependencies
# remotes::install_github(c("ncss-tech/aqp",
# "ncss-tech/soilDB",
# "obrl-soil/mpspline2"))
# load packages
library(aqp) # for SoilProfileCollection object + wrangling code
library(soilDB) # sample data
library(mpspline2) # generic mass-preserving/equal area spline implementation
#' Missing-data-safe, SPC-wide wrapper around mpspline2::mpspline "continuous" 1cm output
#'
#' @name do_aqp_mpspline
#'
#' @description Facilitate safe use of just about any numeric SPC horizon attribute, from any SPC, with \code{mpspline2::mpspline}.
#'
#' This function will automatically filter profiles with \code{NA} in attribute of interest.
#'
#' This may be more conservative filtering than you expect, with intention of splines over a constant interpolation interval [with essentially no gaps].
#'
#' This is all with an eye toward aggregating many profile-level splines together where missing data is hard to reason over.
#'
#' Data completeness is assessed and the input SPC is filtered and truncated to create a container for the 1cm results from \code{mpspline2::mpspline}.
#'
#' @param object A SoilProfileCollection
#' @param var_name Column name in \code{@horizons} slot of \code{object} containing numeric values to spline
#' @param pattern Regex pattern to match for bottom of profile (passed to estimateSoilDepth) default: "R|Cr|Cd|qm"
#' @param hzdesgn Column name in \code{@horizons} slot of \code{object} containing horizon designations default: \code{aqp::guessHzDesgnName(object)}
#' @param ... Additional arguments to \code{mpspline2::mpspline}
#'
#' @author Andrew G. Brown
#'
#' @return A SoilProfileCollection with 1cm slices. Spline variables are in columns prefixed with "spline_" and RMSE/RMSE_IQR are in colums prefixed with "rmse_". If any profiles were removed from the collection, their profile IDs are stored in attr(result, 'removed').
#'
#' @export do_aqp_mpspline
#'
#' @examples
#'
#' library(aqp)
#' data(sp1)
#' depths(sp1) <- id ~ top + bottom
#'
#' res <- do_aqp_mpspline(sp1, "prop")
#' plotSPC(res[1:5,], color = "spline_prop", divide.hz = FALSE)
#'
do_aqp_mpspline <- function(object, var_name = NULL,
pattern = "R|Cr|Cd|qm", hzdesgn = guessHzDesgnName(object),
...) {
if(is.null(var_name) | !(var_name %in% horizonNames(object)))
stop("argument `var_name` must specify a single horizon-level variable", call.=FALSE)
hztop <- horizonDepths(object)[1]
hzbot <- horizonDepths(object)[2]
# glom to "available interval" in each profile
# NOTE: we will handle warnings (profiles with no data at all) at end
spc.sub <- suppressWarnings(glomApply(object, function(p) {
i <- which(diff(c(0, cumsum(!is.na(p[[var_name]])))) == 1)
h <- horizons(p)
# basically this excludes NA values at top and bottom of profile
# (O horizons, bedrock) but wont check missing values inbetween
# need at least two horizons to make a spline
if(length(i) < 2)
return(c(0,0))
top_depth <- h[[hztop]][i[1]]
bot_depth <- h[[hzbot]][i[length(i)]]
return(c(top_depth, bot_depth))
}))
# debug : inspect horizon values for var_name
#plot(spc.sub[1:10,], color=var_name)
# only take profiles that have 100% data coverage in above interval
# i.e. if a single horizon is missing data, remove whole profile
spc.sub$nona <- profileApply(spc.sub, function(p) any(is.na(p[[var_name]])))
spc.sub <- spc.sub[which(!spc.sub$nona),]
# calculate the deepest top depth and shallowest bottom depth
mindepth <- max(profileApply(spc.sub, function(p) p[,1][[hztop]]))
maxdepth <- min(profileApply(spc.sub, estimateSoilDepth, p = pattern, name = hzdesgn))
# we will only make interpolations that the "whole SPC supports"
# the thought is that these 1cm slices will be further aggregated downstream
spc.sub <- glomApply(spc.sub, function(p) c(mindepth, maxdepth), truncate = TRUE)
# do the splines
res <- mpspline2::mpspline(horizons(spc.sub)[c(idname(spc.sub),
horizonDepths(spc.sub),
var_name)],
var_name = var_name, ...)
# concatenate results for re-insertion
res2 <- do.call('c', lapply(profile_id(spc.sub), function(pid) {
drange <- mindepth:maxdepth
zero.idx <- drange == 0
if(any(zero.idx))
drange <- drange[-which(zero.idx)]
return(res[[pid]]$est_1cm[drange])
# this returns the 1cm estimate which conforms with sliced spc
#
# debug: prove that mass is preserved in output by returning block estimates
# return(res[[pid]]$est_icm)
}))
# get the RMSE
reserr <- do.call('c', lapply(profile_id(spc.sub), function(pid) {
return(res[[pid]]$est_err)
}))
# make 1:1 with site
reserr_iqr <- reserr[names(reserr) == "RMSE_IQR"]
reserr <- reserr[names(reserr) == "RMSE"]
# inspect
#reserr_iqr
#reserr
# single horizon results cannot be splined, filter those out
spc.sub <- filter(spc.sub, profile_id(spc.sub) %in% names(res))
# adjustment for aqp::slice index logic versus glom interval logic
if(mindepth == 0) {
maxdepth <- maxdepth - 1
}
# create slices 1cm thick to insert spline result
spc.spl <- aqp::slice(spc.sub, formula(sprintf("%s:%s ~ %s",
mindepth, maxdepth,
var_name)))
# create new "spline_"+var_name variable
spc.spl[[paste0("spline_",var_name)]] <- res2
# create new "rmse_"+var_name as site level attributes
spc.spl[[paste0("rmse_",var_name)]] <- reserr
spc.spl[[paste0("rmse_iqr_",var_name)]] <- reserr_iqr
# determine what profiles were removed
removed <- profile_id(object)[!profile_id(object) %in% profile_id(spc.spl)]
# add an attribute with removed profile IDs. there are three steps
# that possibly remove data:
# - profiles removed by glomApply have no var_name data at all.
# - 100% coverage filtering step -- conservative filter to keep from making splines from bad data
# - mpspline itself will remove profiles with e.g. just one horizon
attr(spc.spl, "removed") <- unique(removed)
return(spc.spl)
}
###
### DEMO CODE
###
### make a combined comparison profile plot
### raw pedon data v.s. 1cm-slice + mass preserving spline
# load sample dataset
data(loafercreek, package = "soilDB")
# set all O horizons to 10 percent clay and pH 5.5
# this isnt "necessary" but allows for interpolation to 0cm in loafercreek
loafercreek$clay[grep("O", loafercreek$hzname)] <- 10
loafercreek$phfield[grep("O", loafercreek$hzname)] <- 5.5
# use aqp wrapper function for SPC input to mpspline
res1 <- do_aqp_mpspline(loafercreek, "clay")
# NOTE: 7 profiles are dropped from loafercreek -> res
# all of them contain no clay data at all
attr(res1, 'removed')
# inspect distribution of RMSE (one for each profile)
plot(density(res1$rmse_clay))
abline(v=quantile(res1$rmse_clay))
# set graphics parameters to minimize whitespace/leave room for legend
par(mar = c(1,1,3,1))
# make sure the removed profiles deserved to be removed
plot(filter(loafercreek, profile_id(loafercreek) %in% attr(res1,'removed')),
color = "clay")
# NOTE: you can run this on anything numeric... but with big datasets
# just a couple pedons missing data at a few depths will limit your range
res2 <- do_aqp_mpspline(loafercreek, "phfield")
plot(res2, color = "spline_phfield", n.legend = 5, divide.hz=FALSE, print.id=FALSE)
attr(res2, 'removed')
# more complete SPC gives broader range of depths
res3 <- do_aqp_mpspline(loafercreek[20:50], "phfield")
plot(res3[7:14,], color = "spline_phfield", n.legend = 5)
attr(res3, 'removed')
# take just a few profiles from the result for plot
res.sub <- res1[3:8,]
# get the original profiles
original <- filter(loafercreek, profile_id(loafercreek) %in% profile_id(res.sub))
# determine max depth in spline output (function of NA values)
max_d <- max(res.sub$hzdepb)
# use glom to truncate inputs to just interval matching spline output
orig.glom <- glomApply(original, function(p) {
return(c(0, max_d)) }, truncate = TRUE)
# set new profile IDs for the spline'd profiles
profile_id(res.sub) <- paste0(profile_id(res.sub),"_mpspline")
# put spline results into variable for plotting
orig.glom$clay_combined <- orig.glom$clay
res.sub$clay_combined <- res.sub$spline_clay
# inspect
plot(res.sub[,48])
# use aqp::union to merge multiple SPCs
spc.combined <- aqp::union(list(orig.glom, res.sub))
# avoid warnings about duplicate phiid
hzidname(spc.combined) <- "hzID"
# make the comparison plot
plotSPC(spc.combined, color = "clay_combined",
plot.order = c(1,7,2,8,3,9,4,10,5,11,6,12))
# different way of looking at things
dev.off()
# set up an xy plot
plot(y=spc.combined$hzdept, x=spc.combined$spline_clay, type="n",
xlab="Total Clay Content (%)", ylab="Depth (cm)",
ylim=c(50,0), xlim=c(10,35))
# add line for profiles # 7 and 8
lapply(7:8, function(i) {
lines(y=spc.combined[i,]$hzdept, x=spc.combined[i,]$clay, col=i, lwd=2)
lines(y=spc.combined[i,]$hzdept, x=spc.combined[i,]$spline_clay, col=i, lty=2) } )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment