Last active
July 13, 2020 01:26
-
-
Save brownag/946af2a5a56bb4c1dd4007e3024dd4c6 to your computer and use it in GitHub Desktop.
Use aqp to handle missing data and typical pitfalls with field profile descriptions for IO from new R package mpspline2 by @obrl_soil
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# making use of @obrl-soil's new mpspline2 package [soon to be on CRAN!] | |
# | |
# author: andrew g brown; 2020/07/12 | |
# | |
# this is an attempt at creating a generic method for mass-preserving / equal-area splines | |
# on aqp SoilProfileCollection (SPC) data that may contain a large amount of missing values. | |
# | |
# it also provides an avenue to get 1cm continuous spline output back in a handy form (1cm sliced SPC). | |
# | |
# In the presence of missing values, getting the result back into the SPC is not always easy. | |
# | |
# The example code that follows should illustrate the major indexing pitfalls you need to | |
# be aware of. | |
# | |
# I have tried to cover corner cases so please let me know if you find a way to break it! | |
# | |
# get development versions of soil packages + dependencies | |
# remotes::install_github(c("ncss-tech/aqp", | |
# "ncss-tech/soilDB", | |
# "obrl-soil/mpspline2")) | |
# load packages | |
library(aqp) # for SoilProfileCollection object + wrangling code | |
library(soilDB) # sample data | |
library(mpspline2) # generic mass-preserving/equal area spline implementation | |
#' Missing-data-safe, SPC-wide wrapper around mpspline2::mpspline "continuous" 1cm output | |
#' | |
#' @name do_aqp_mpspline | |
#' | |
#' @description Facilitate safe use of just about any numeric SPC horizon attribute, from any SPC, with \code{mpspline2::mpspline}. | |
#' | |
#' This function will automatically filter profiles with \code{NA} in attribute of interest. | |
#' | |
#' This may be more conservative filtering than you expect, with intention of splines over a constant interpolation interval [with essentially no gaps]. | |
#' | |
#' This is all with an eye toward aggregating many profile-level splines together where missing data is hard to reason over. | |
#' | |
#' Data completeness is assessed and the input SPC is filtered and truncated to create a container for the 1cm results from \code{mpspline2::mpspline}. | |
#' | |
#' @param object A SoilProfileCollection | |
#' @param var_name Column name in \code{@horizons} slot of \code{object} containing numeric values to spline | |
#' @param pattern Regex pattern to match for bottom of profile (passed to estimateSoilDepth) default: "R|Cr|Cd|qm" | |
#' @param hzdesgn Column name in \code{@horizons} slot of \code{object} containing horizon designations default: \code{aqp::guessHzDesgnName(object)} | |
#' @param ... Additional arguments to \code{mpspline2::mpspline} | |
#' | |
#' @author Andrew G. Brown | |
#' | |
#' @return A SoilProfileCollection with 1cm slices. Spline variables are in columns prefixed with "spline_" and RMSE/RMSE_IQR are in colums prefixed with "rmse_". If any profiles were removed from the collection, their profile IDs are stored in attr(result, 'removed'). | |
#' | |
#' @export do_aqp_mpspline | |
#' | |
#' @examples | |
#' | |
#' library(aqp) | |
#' data(sp1) | |
#' depths(sp1) <- id ~ top + bottom | |
#' | |
#' res <- do_aqp_mpspline(sp1, "prop") | |
#' plotSPC(res[1:5,], color = "spline_prop", divide.hz = FALSE) | |
#' | |
do_aqp_mpspline <- function(object, var_name = NULL, | |
pattern = "R|Cr|Cd|qm", hzdesgn = guessHzDesgnName(object), | |
...) { | |
if(is.null(var_name) | !(var_name %in% horizonNames(object))) | |
stop("argument `var_name` must specify a single horizon-level variable", call.=FALSE) | |
hztop <- horizonDepths(object)[1] | |
hzbot <- horizonDepths(object)[2] | |
# glom to "available interval" in each profile | |
# NOTE: we will handle warnings (profiles with no data at all) at end | |
spc.sub <- suppressWarnings(glomApply(object, function(p) { | |
i <- which(diff(c(0, cumsum(!is.na(p[[var_name]])))) == 1) | |
h <- horizons(p) | |
# basically this excludes NA values at top and bottom of profile | |
# (O horizons, bedrock) but wont check missing values inbetween | |
# need at least two horizons to make a spline | |
if(length(i) < 2) | |
return(c(0,0)) | |
top_depth <- h[[hztop]][i[1]] | |
bot_depth <- h[[hzbot]][i[length(i)]] | |
return(c(top_depth, bot_depth)) | |
})) | |
# debug : inspect horizon values for var_name | |
#plot(spc.sub[1:10,], color=var_name) | |
# only take profiles that have 100% data coverage in above interval | |
# i.e. if a single horizon is missing data, remove whole profile | |
spc.sub$nona <- profileApply(spc.sub, function(p) any(is.na(p[[var_name]]))) | |
spc.sub <- spc.sub[which(!spc.sub$nona),] | |
# calculate the deepest top depth and shallowest bottom depth | |
mindepth <- max(profileApply(spc.sub, function(p) p[,1][[hztop]])) | |
maxdepth <- min(profileApply(spc.sub, estimateSoilDepth, p = pattern, name = hzdesgn)) | |
# we will only make interpolations that the "whole SPC supports" | |
# the thought is that these 1cm slices will be further aggregated downstream | |
spc.sub <- glomApply(spc.sub, function(p) c(mindepth, maxdepth), truncate = TRUE) | |
# do the splines | |
res <- mpspline2::mpspline(horizons(spc.sub)[c(idname(spc.sub), | |
horizonDepths(spc.sub), | |
var_name)], | |
var_name = var_name, ...) | |
# concatenate results for re-insertion | |
res2 <- do.call('c', lapply(profile_id(spc.sub), function(pid) { | |
drange <- mindepth:maxdepth | |
zero.idx <- drange == 0 | |
if(any(zero.idx)) | |
drange <- drange[-which(zero.idx)] | |
return(res[[pid]]$est_1cm[drange]) | |
# this returns the 1cm estimate which conforms with sliced spc | |
# | |
# debug: prove that mass is preserved in output by returning block estimates | |
# return(res[[pid]]$est_icm) | |
})) | |
# get the RMSE | |
reserr <- do.call('c', lapply(profile_id(spc.sub), function(pid) { | |
return(res[[pid]]$est_err) | |
})) | |
# make 1:1 with site | |
reserr_iqr <- reserr[names(reserr) == "RMSE_IQR"] | |
reserr <- reserr[names(reserr) == "RMSE"] | |
# inspect | |
#reserr_iqr | |
#reserr | |
# single horizon results cannot be splined, filter those out | |
spc.sub <- filter(spc.sub, profile_id(spc.sub) %in% names(res)) | |
# adjustment for aqp::slice index logic versus glom interval logic | |
if(mindepth == 0) { | |
maxdepth <- maxdepth - 1 | |
} | |
# create slices 1cm thick to insert spline result | |
spc.spl <- aqp::slice(spc.sub, formula(sprintf("%s:%s ~ %s", | |
mindepth, maxdepth, | |
var_name))) | |
# create new "spline_"+var_name variable | |
spc.spl[[paste0("spline_",var_name)]] <- res2 | |
# create new "rmse_"+var_name as site level attributes | |
spc.spl[[paste0("rmse_",var_name)]] <- reserr | |
spc.spl[[paste0("rmse_iqr_",var_name)]] <- reserr_iqr | |
# determine what profiles were removed | |
removed <- profile_id(object)[!profile_id(object) %in% profile_id(spc.spl)] | |
# add an attribute with removed profile IDs. there are three steps | |
# that possibly remove data: | |
# - profiles removed by glomApply have no var_name data at all. | |
# - 100% coverage filtering step -- conservative filter to keep from making splines from bad data | |
# - mpspline itself will remove profiles with e.g. just one horizon | |
attr(spc.spl, "removed") <- unique(removed) | |
return(spc.spl) | |
} | |
### | |
### DEMO CODE | |
### | |
### make a combined comparison profile plot | |
### raw pedon data v.s. 1cm-slice + mass preserving spline | |
# load sample dataset | |
data(loafercreek, package = "soilDB") | |
# set all O horizons to 10 percent clay and pH 5.5 | |
# this isnt "necessary" but allows for interpolation to 0cm in loafercreek | |
loafercreek$clay[grep("O", loafercreek$hzname)] <- 10 | |
loafercreek$phfield[grep("O", loafercreek$hzname)] <- 5.5 | |
# use aqp wrapper function for SPC input to mpspline | |
res1 <- do_aqp_mpspline(loafercreek, "clay") | |
# NOTE: 7 profiles are dropped from loafercreek -> res | |
# all of them contain no clay data at all | |
attr(res1, 'removed') | |
# inspect distribution of RMSE (one for each profile) | |
plot(density(res1$rmse_clay)) | |
abline(v=quantile(res1$rmse_clay)) | |
# set graphics parameters to minimize whitespace/leave room for legend | |
par(mar = c(1,1,3,1)) | |
# make sure the removed profiles deserved to be removed | |
plot(filter(loafercreek, profile_id(loafercreek) %in% attr(res1,'removed')), | |
color = "clay") | |
# NOTE: you can run this on anything numeric... but with big datasets | |
# just a couple pedons missing data at a few depths will limit your range | |
res2 <- do_aqp_mpspline(loafercreek, "phfield") | |
plot(res2, color = "spline_phfield", n.legend = 5, divide.hz=FALSE, print.id=FALSE) | |
attr(res2, 'removed') | |
# more complete SPC gives broader range of depths | |
res3 <- do_aqp_mpspline(loafercreek[20:50], "phfield") | |
plot(res3[7:14,], color = "spline_phfield", n.legend = 5) | |
attr(res3, 'removed') | |
# take just a few profiles from the result for plot | |
res.sub <- res1[3:8,] | |
# get the original profiles | |
original <- filter(loafercreek, profile_id(loafercreek) %in% profile_id(res.sub)) | |
# determine max depth in spline output (function of NA values) | |
max_d <- max(res.sub$hzdepb) | |
# use glom to truncate inputs to just interval matching spline output | |
orig.glom <- glomApply(original, function(p) { | |
return(c(0, max_d)) }, truncate = TRUE) | |
# set new profile IDs for the spline'd profiles | |
profile_id(res.sub) <- paste0(profile_id(res.sub),"_mpspline") | |
# put spline results into variable for plotting | |
orig.glom$clay_combined <- orig.glom$clay | |
res.sub$clay_combined <- res.sub$spline_clay | |
# inspect | |
plot(res.sub[,48]) | |
# use aqp::union to merge multiple SPCs | |
spc.combined <- aqp::union(list(orig.glom, res.sub)) | |
# avoid warnings about duplicate phiid | |
hzidname(spc.combined) <- "hzID" | |
# make the comparison plot | |
plotSPC(spc.combined, color = "clay_combined", | |
plot.order = c(1,7,2,8,3,9,4,10,5,11,6,12)) | |
# different way of looking at things | |
dev.off() | |
# set up an xy plot | |
plot(y=spc.combined$hzdept, x=spc.combined$spline_clay, type="n", | |
xlab="Total Clay Content (%)", ylab="Depth (cm)", | |
ylim=c(50,0), xlim=c(10,35)) | |
# add line for profiles # 7 and 8 | |
lapply(7:8, function(i) { | |
lines(y=spc.combined[i,]$hzdept, x=spc.combined[i,]$clay, col=i, lwd=2) | |
lines(y=spc.combined[i,]$hzdept, x=spc.combined[i,]$spline_clay, col=i, lty=2) } ) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment