brownag/big-SPC-gNATSGO.R

## big-SPC-gNATSGO.R
library(aqp)
library(sf)
library(soilDB)
library(data.table)
library(microbenchmark)

# path to gNATSGO Geodatabase
dsn <- "~/geodata/gNATSGO/2020/gNATSGO_CONUS.gdb"

# This is one of the main bottlenecks (not the only one) to reading out of GDB
#  chorizon is a pretty large amount of data. Let's save it as a flat file (2.1GB uncompressd)
#
#  in principle, I think all tables could be pre-read out of the GDB and then joined to a
#  SPC "skeleton" based off of the chorizon data.
#
# system.time(h <- sf::read_sf(dsn = dsn, layer = "chorizon",
#                             stringsAsFactors = FALSE, as_tibble = FALSE))
# # user     system elapsed
# # 216.374   5.678 222.533
#
# data.table::fwrite(h, file="~/geodata/gNATSGO/2020/horizons.csv")

# read the data in from flat file (TODO: consider SQLite storage of all tables and select * from chorizon)
starttime <- Sys.time()
gnatsgo <- data.table::fread(file = "~/geodata/gNATSGO/2020/horizons.csv")
print(Sys.time() - starttime)
# Time difference of 9.053402 secs

system.time(depths(gnatsgo) <- cokey ~ hzdept_r + hzdepb_r)
# user  system elapsed
# 78.714   1.057  77.307

system.time(l <-  get_legend_from_GDB(dsn, "areasymbol != ''"))
# user  system elapsed
# 0.196   0.000   0.195

system.time(m <- get_mapunit_from_GDB(dsn, "muname != ''"))
# user  system elapsed
# 6.555   0.077   6.926

system.time(co <- get_component_from_GDB(dsn = dsn, "compname != ''"))
# user  system elapsed
# 35.453   0.384  36.051

# need chunking
# cgd <- soilDB:::.get_cogeomordesc_from_GDB(dsn, co = co)
# pmg <- soilDB:::.get_copmgrp_from_GDB(gdb.path, co = co)

system.time(site(gnatsgo) <- co)
# user  system elapsed
# 4.334   0.004   3.248

system.time(site(gnatsgo) <- m)
# user  system elapsed
# 3.247   0.000   2.700

system.time(site(gnatsgo) <- l)
# user  system elapsed
# 2.838   0.000   2.499

microbenchmark::microbenchmark({ a <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1]},
                                { b <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1:2]},
                               { c <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1]},
                               { d <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1:2]},
                               times = 10)

# one or two [random] pedons; one or two horizons [first or first and second]
# # Unit: milliseconds
#                                                         expr      min       lq     mean   median       uq      max neval cld
# {     a <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1] } 116.9621 126.7610 131.7065 135.5488 136.5518 139.0037    10  a
# {     b <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1:2] } 121.5565 134.4841 135.2277 137.6504 139.2550 141.0407    10  a
# {     c <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1] } 117.7989 137.6253 140.2133 138.9760 151.9303 156.8037    10  ab
# {     d <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1:2] } 132.5007 135.7209 149.1425 154.1004 155.4409 161.5209    10   b


# open-ended j indexing (generally a pretty slow operation)
microbenchmark::microbenchmark({foo <- gnatsgo[,1]
                                foo <- NULL},
                               {foo <- gnatsgo[,2:3]
                                foo <- NULL},
                              times = 5)
# first horizon or second and third horizon, all profiles
# Unit: seconds
#                                        expr      min       lq     mean   median       uq      max neval cld
# {     foo <- gnatsgo[, 1]     foo <- NULL } 14.97619 15.46312 15.78757 16.09652 16.13384 16.26816     5  a
# {     foo <- gnatsgo[, 2:3]     foo <- NULL } 18.65325 18.68035 18.85315 18.83340 19.03189 19.06688     5   b
	library(aqp)
	library(sf)
	library(soilDB)
	library(data.table)
	library(microbenchmark)

	# path to gNATSGO Geodatabase
	dsn <- "~/geodata/gNATSGO/2020/gNATSGO_CONUS.gdb"

	# This is one of the main bottlenecks (not the only one) to reading out of GDB
	# chorizon is a pretty large amount of data. Let's save it as a flat file (2.1GB uncompressd)
	#
	# in principle, I think all tables could be pre-read out of the GDB and then joined to a
	# SPC "skeleton" based off of the chorizon data.
	#
	# system.time(h <- sf::read_sf(dsn = dsn, layer = "chorizon",
	# stringsAsFactors = FALSE, as_tibble = FALSE))
	# # user system elapsed
	# # 216.374 5.678 222.533
	#
	# data.table::fwrite(h, file="~/geodata/gNATSGO/2020/horizons.csv")

	# read the data in from flat file (TODO: consider SQLite storage of all tables and select * from chorizon)
	starttime <- Sys.time()
	gnatsgo <- data.table::fread(file = "~/geodata/gNATSGO/2020/horizons.csv")
	print(Sys.time() - starttime)
	# Time difference of 9.053402 secs

	system.time(depths(gnatsgo) <- cokey ~ hzdept_r + hzdepb_r)
	# user system elapsed
	# 78.714 1.057 77.307

	system.time(l <- get_legend_from_GDB(dsn, "areasymbol != ''"))
	# user system elapsed
	# 0.196 0.000 0.195

	system.time(m <- get_mapunit_from_GDB(dsn, "muname != ''"))
	# user system elapsed
	# 6.555 0.077 6.926

	system.time(co <- get_component_from_GDB(dsn = dsn, "compname != ''"))
	# user system elapsed
	# 35.453 0.384 36.051

	# need chunking
	# cgd <- soilDB:::.get_cogeomordesc_from_GDB(dsn, co = co)
	# pmg <- soilDB:::.get_copmgrp_from_GDB(gdb.path, co = co)

	system.time(site(gnatsgo) <- co)
	# user system elapsed
	# 4.334 0.004 3.248

	system.time(site(gnatsgo) <- m)
	# user system elapsed
	# 3.247 0.000 2.700

	system.time(site(gnatsgo) <- l)
	# user system elapsed
	# 2.838 0.000 2.499

	microbenchmark::microbenchmark({ a <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1]},
	{ b <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1:2]},
	{ c <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1]},
	{ d <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1:2]},
	times = 10)

	# one or two [random] pedons; one or two horizons [first or first and second]
	# # Unit: milliseconds
	# expr min lq mean median uq max neval cld
	# { a <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1] } 116.9621 126.7610 131.7065 135.5488 136.5518 139.0037 10 a
	# { b <- gnatsgo[round(runif(1, 1, length(gnatsgo))), 1:2] } 121.5565 134.4841 135.2277 137.6504 139.2550 141.0407 10 a
	# { c <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1] } 117.7989 137.6253 140.2133 138.9760 151.9303 156.8037 10 ab
	# { d <- gnatsgo[round(runif(2, 1, length(gnatsgo))), 1:2] } 132.5007 135.7209 149.1425 154.1004 155.4409 161.5209 10 b


	# open-ended j indexing (generally a pretty slow operation)
	microbenchmark::microbenchmark({foo <- gnatsgo[,1]
	foo <- NULL},
	{foo <- gnatsgo[,2:3]
	foo <- NULL},
	times = 5)
	# first horizon or second and third horizon, all profiles
	# Unit: seconds
	# expr min lq mean median uq max neval cld
	# { foo <- gnatsgo[, 1] foo <- NULL } 14.97619 15.46312 15.78757 16.09652 16.13384 16.26816 5 a
	# { foo <- gnatsgo[, 2:3] foo <- NULL } 18.65325 18.68035 18.85315 18.83340 19.03189 19.06688 5 b