Extract background points from hypervolumes and calculate AUC and the Boyce index
##%######################################################%##
#                                                          #
####           Extracts probability estimates           ####
####         from calibration/validation points         ####
####          for n-dimensional hypervolumes,           ####
####          and compares these to background          ####
####         values from the same hypervolume.          ####
####         Values are then rescaled to {0, 1}         ####
####         based on a 10% omission rate, with         ####
####          upper values capped at the 90th           ####
####         percentile. AUC and Boyce indices          ####
####        are then calculated on the rescaled         ####
####         data. Stuart C Brown - 2021-10-18          ####
#                                                          #
##%######################################################%##
# function for background probability
## essentially the same as hypervolume::hypervolume_estimate_probability(),
## but removes rows/weights where distances are 0. This is necessary because
## each randomly sampled background point has a distance of 0 to itself,
## which becomes Inf when weighted.
## need to provide n.points and a random.seed for sampling background points
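## A quick illustration of the problem (made-up values, kept commented out so
## the script's behaviour is unchanged): a zero distance produces an infinite
## inverse-distance weight, which turns the weighted mean into NaN.
# c(0, 1, 2)^-1                                     # -> Inf 1.0 0.5
# weighted.mean(c(0.9, 0.5, 0.1), c(0, 1, 2)^-1)    # -> NaN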
background_probability <- function(hv, n.points = 10000, weight.exponent = -1,
                                   random.seed = 65, set.edges.zero = TRUE,
                                   edges.zero.distance.factor = 1, verbose = TRUE) {
  require(pbapply)
  require(pdist)
  np <- nrow(hv@RandomPoints)
  dimhv <- ncol(hv@RandomPoints)
  numpointstokeep_hv <- floor(min(c(n.points, np)))
  if (numpointstokeep_hv != n.points && verbose) {
    warning(sprintf("There were fewer than %s background points available. Using %s points. See hv@RandomPoints.",
                    n.points, np),
            call. = FALSE, immediate. = TRUE, noBreaks. = TRUE)
  }
  ## set random seed for reproducibility
  ## only an issue if n.points is less than the number of random points in the hv
  {set.seed(random.seed); index_ss <- sample(1:np, numpointstokeep_hv, replace = FALSE)}
  hv_points_ss <- hv@RandomPoints[index_ss, , drop = FALSE]
  prob_ss <- hv@ValueAtRandomPoints[index_ss]
  ## cutoff distance for the optional edge check: the typical
  ## nearest-neighbour spacing implied by the point density
  point_density <- nrow(hv_points_ss)/hv@Volume
  cutoff_dist <- point_density^(-1/dimhv)
  if (verbose) {
    cat(sprintf("Retaining %d/%d hypervolume random points as background points.\n",
                nrow(hv_points_ss), np))
  }
  pbapply::pboptions(type = if (verbose) "timer" else "none")
  points <- as.matrix(hv_points_ss)
  probabilities <- pbsapply(1:nrow(points), function(i) {
    distances <- pdist::pdist(points[i, , drop = FALSE], hv_points_ss)@dist
    # get indices for distance == 0 (at minimum, the point's self-distance)
    dist0 <- which(distances == 0)
    weights <- distances^weight.exponent
    # exclude dist == 0 from the weighted mean. Otherwise weights are Inf.
    keep <- if (length(dist0)) -dist0 else seq_along(distances)
    result <- sum(prob_ss[keep] * weights[keep])/sum(weights[keep])
    if (set.edges.zero == TRUE) {
      # self-distances are excluded here too; otherwise min(distances)
      # is always 0 and the edge check could never trigger
      if (min(distances[keep]) > cutoff_dist * edges.zero.distance.factor) {
        result <- 0
      }
    }
    return(result)
  }, simplify = TRUE, USE.NAMES = FALSE)
  return(probabilities)
}
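## Usage sketch (hypothetical, commented out; assumes `hv` is an existing
## Hypervolume object, e.g. from hypervolume::hypervolume_gaussian()):
# bg <- background_probability(hv, n.points = 5000, random.seed = 42)
# summary(bg)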
TEST <- FALSE
if (TEST) {
library(data.table)
library(hypervolume)
library(scales)
library(raster)
library(rnaturalearth)
## enmSdm must be installed from GitHub - see below
if (!require(enmSdm)) {
  library(remotes)
  Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS = "true")
  # install omnibus as well (used by enmSdm)
  install_github("adamlilith/omnibus")
  install_github("adamlilith/enmSdm", dependencies = TRUE)
  library(enmSdm)
}
# load in lat/lon data
data("quercus")
data_alba <- subset(quercus, Species == "Quercus alba")[, c("Longitude","Latitude")]
# get worldclim data from internet
climatelayers <- getData("worldclim", var = "bio", res = 10)
# z-transform climate layers to make axes comparable
climatelayers_ss <- climatelayers[[c(1,4,12,15)]]
climatelayers_ss <- scale(climatelayers_ss, center = TRUE, scale = TRUE)
climatelayers_ss_cropped <- aggregate(crop(climatelayers_ss, extent(-150,-50,15,60)), 3)
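## optional sanity check (commented out): after z-transformation each layer
## should have a cell mean of ~0 and a cell sd of ~1
# round(cellStats(climatelayers_ss, "mean"), 2)
# round(cellStats(climatelayers_ss, "sd"), 2)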
# extract transformed climate values
climate_alba <- extract(climatelayers_ss_cropped, data_alba)
{set.seed(54); valIDX <- sort(sample(1:nrow(climate_alba), 0.1 * nrow(climate_alba), replace = FALSE))}
# calibration set
climate_alba_cal <- climate_alba[-valIDX, ]
# validation set
climate_alba_val <- climate_alba[valIDX, ]
# compute Gaussian hypervolume
hv_alba <- hypervolume_gaussian(climate_alba_cal, name = "alba", samples.per.point = 10)
hv_alba
plot(hv_alba,
plot.function.additional = function(i, j) {
points(x = climate_alba_val[, i], y = climate_alba_val[, j],
col = "blue2", pch = 16)},
show.legend = FALSE)
# % of val points in hv
round(sum(hypervolume_inclusion_test(hv_alba, points = climate_alba_val, verbose = FALSE,
fast.or.accurate = "accurate"))/nrow(climate_alba_val), 2)*100
# probabilities
cal_hs <- hypervolume_estimate_probability(hv_alba, climate_alba_cal) # calibration points
val_hs <- hypervolume_estimate_probability(hv_alba, climate_alba_val) # validation points
# extract values from random points (equivalent to background points)
## see ?`Hypervolume-class` for details
## the smaller of n.points (10,000) or the total number of random points is sampled
bg_scores <- background_probability(hv_alba, n.points = 10000, random.seed = 20211018,
verbose = TRUE)
{par(mfrow = c(2,2))
hist(cal_hs, breaks = 20)
abline(v = quantile(cal_hs, probs = c(0.10, 0.90)), col = "red", lty = 2)
hist(val_hs, breaks = 20)
abline(v = quantile(cal_hs, probs = c(0.10, 0.90)), col = "red", lty = 2)
hist(bg_scores, breaks = 20)
abline(v = quantile(cal_hs, probs = c(0.10, 0.90)), col = "red", lty = 2)
text(mean(cal_hs), 450, "red lines show\np10 and p90\nof cal_hs")
par(mfrow = c(1,1))}
# Apply p10 threshold to exclude 10% of training presences
## p10 is defined from calibration data
p10 <- quantile(cal_hs, 0.10)
p90 <- quantile(cal_hs, 0.90)
# Threshold. Anything <= p10 becomes NA, anything > p90 becomes p90
## thresholded scores are then rescaled to {0, 1}
cal_hs[cal_hs <= p10] <- NA; cal_hs[cal_hs > p90] <- p90; cal_hs <- rescale(cal_hs, from = c(p10, p90), to = c(0, 1))
val_hs[val_hs <= p10] <- NA; val_hs[val_hs > p90] <- p90; val_hs <- rescale(val_hs, from = c(p10, p90), to = c(0, 1))
bg_scores[bg_scores <= p10] <- NA; bg_scores[bg_scores > p90] <- p90; bg_scores <- rescale(bg_scores, from = c(p10, p90), to = c(0, 1))
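## Illustration of the rescaling with made-up numbers (assuming p10 = 0.2 and
## p90 = 0.8): values just above p10 map to ~0, values capped at p90 map to 1,
## and values at or below p10 were already set to NA above
# scales::rescale(c(0.2, 0.5, 0.8), from = c(0.2, 0.8), to = c(0, 1))  # -> 0.0 0.5 1.0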
{par(mfrow = c(2,2))
hist(cal_hs, breaks = 20)
hist(val_hs, breaks = 20)
hist(bg_scores, breaks = 20)
par(mfrow = c(1,1))}
# AUC
aucWeighted(cal_hs, bg_scores, na.rm = TRUE)
aucWeighted(val_hs, bg_scores, na.rm = TRUE)
# Boyce
contBoyce(cal_hs, bg_scores, na.rm = TRUE, binWidth = 0.1)
contBoyce(val_hs, bg_scores, na.rm = TRUE, binWidth = 0.1)
contBoyce2x(cal_hs, bg_scores, na.rm = TRUE)
contBoyce2x(val_hs, bg_scores, na.rm = TRUE)
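## interpretation note: the continuous Boyce index ranges from -1 to 1;
## values near 1 indicate predictions consistent with the evaluation presences,
## ~0 is no better than random, and negative values indicate counter
## predictions (Hirzel et al. 2006)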
# Spatial projection
alba_map <- hypervolume_project(hv_alba, climatelayers_ss_cropped, set.edges.zero = TRUE)
plot(alba_map, col = hcl.colors(100, "Green-Yellow", rev = TRUE),
legend = TRUE, main = "Quercus alba [raw scale]",
sub = "Calibration points in blue; validation points in red",
addfun = function() {
lines(ne_coastline(110, "sp"))
points(data_alba[-valIDX, ], cex = 0.2, pch = 16, col = "#0000F068")
points(data_alba[valIDX, ], cex = 0.2, pch = 16, col = "#FF30305F")
})
# Thresholded spatial projection
## p10 is very conservative
alba_map_thr <- alba_map
alba_map_thr[alba_map_thr <= p10] <- NA
alba_map_thr[alba_map_thr > p90] <- p90
values(alba_map_thr) <- rescale(values(alba_map_thr), from = c(p10, p90), to = c(0, 1))
plot(alba_map_thr, col = hcl.colors(100, "Green-Yellow", rev = TRUE),
legend = TRUE, main = "Quercus alba [rescaled]",
sub = "Calibration points in blue; validation points in red",
zlim = c(0, 1),
addfun = function() {
lines(ne_coastline(110, "sp"))
points(data_alba[-valIDX, ], cex = 0.2, pch = 16, col = "#0000F068")
points(data_alba[valIDX, ], cex = 0.2, pch = 16, col = "#FF30305F")
})
# Thresholded spatial projection
## use minimum training presence
alba_map_mtp <- alba_map
mtp <- min(extract(alba_map, data_alba[-valIDX, ]), na.rm = TRUE)
maxtp <- max(values(alba_map_mtp), na.rm = TRUE)
alba_map_mtp[alba_map_mtp <= mtp] <- NA
alba_map_mtp <- raster::stretch(alba_map_mtp, minv = 0, maxv = 1, smin = mtp, smax = maxtp)
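## raster::stretch() linearly maps values from [smin, smax] onto [minv, maxv],
## so cells just above the minimum training presence rescale to ~0 (cells at
## or below mtp were set to NA above) and the maximum rescales to 1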
plot(alba_map_mtp, col = hcl.colors(100, "Green-Yellow", rev = TRUE),
legend = TRUE, main = "Quercus alba [rescaled mtp]",
sub = "Calibration points in blue; validation points in red",
zlim = c(0, 1),
addfun = function() {
lines(ne_coastline(110, "sp"))
points(data_alba[-valIDX, ], cex = 0.2, pch = 16, col = "#0000F068")
points(data_alba[valIDX, ], cex = 0.2, pch = 16, col = "#FF30305F")
})
# Thresholded spatial projection
## use 95th percentile of all values > 0 as upper bound. 0 as lower.
alba_map_q95 <- alba_map
q95 <- quantile(alba_map_q95[alba_map_q95 > 0], 0.95, na.rm = TRUE)
alba_map_q95 <- raster::stretch(alba_map_q95, minv = 0, maxv = 1, smin = 0, smax = q95)
plot(alba_map_q95, col = hcl.colors(100, "Green-Yellow", rev = TRUE),
legend = TRUE, main = "Quercus alba [rescaled q95]",
sub = "Calibration points in blue; validation points in red",
zlim = c(0, 1),
addfun = function() {
lines(ne_coastline(110, "sp"))
points(data_alba[-valIDX, ], cex = 0.2, pch = 16, col = "#0000F068")
points(data_alba[valIDX, ], cex = 0.2, pch = 16, col = "#FF30305F")
})
{par(mfrow = c(2,2))
hist(extract(alba_map_thr, data_alba), breaks = 20, xlim = c(0, 1), main = "p10/p90")
hist(extract(alba_map_mtp, data_alba), breaks = 20, xlim = c(0, 1), main = "mtp")
hist(extract(alba_map_q95, data_alba), breaks = 20, xlim = c(0, 1), main = "q95")
par(mfrow = c(1,1))}
}