Rough sketch of ZOON modular structure
# _____________ ____________ ____________ ___________________
# \ ______ \__ _\ ______ \ _\ ______ \ _\ ___ \______
# \ / \ \ \ \ \ \__ _\ \ \ \ \ _\ \ \/ \ \_____
# \ / \ \ \ \ \ \ \ \ \ \ \ \__ _\ \ \ \ \ \____
# \ /______ \ \ \ \_____\ \ \ \ \ \_____\ \ \ \ \ \ \ \ \___
# \___________\__\_\___________\__\_\_\___________\__\_\_\_\___________\__
# \____________\___\_\____________\___\_\_\____________\__
# \____________\_____\_\____________\___
# SPECIES DISTRIBUTION MODELLING \____________\____
#
# A PROTOTYPE FOR REPRODUCIBLE, ACCESSIBLE & SHAREABLE SCIENTIFIC OUTPUTS IN R
# (Greg, I edited your lovely ASCII art to bring it under the 80 character
# line limit for the Google R style guide)
# This is a *really* rough sketch of a modular structure for the ZOON workflow.
# In this version, a workflow consists of 5 types of modules, representing the
# key steps in an SDM workflow:
#
# occurrence module
# covariate module
# process module
# model module
# map module
#
# The user must select one module for each of these module types. These are
# then passed to the workflow wrapper function which runs through each step.
# Every module type should have a protocol (sketched below) determining what
# it takes as input and what it gives as output. Different modules for each
# module type could be stored in community repositories, enabling users to
# quickly drop in a newly developed model or processing step and run it with
# their data. The
# structure also makes it very easy to compare different models against a
# standard dataset.
# These modules are just functions and wherever possible they should be self-
# contained, taking no arguments other than those used in the workflow.
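#
# As a sketch, the module protocols boil down to these five signatures
# (the argument names match the workflow wrapper further down; the bodies
# are placeholders):
#
# occurrenceFn <- function (extent) {...}     # -> dataframe of records
# covariateFn  <- function (extent) {...}     # -> Raster* of covariates
# processFn    <- function (occ, ras) {...}   # -> model-ready dataframe
# modelFn      <- function (df) {...}         # -> fitted model object
# mapFn        <- function (model, ras) {...} # -> prediction raster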
#
# Clearly this is very simplistic; obvious next steps include modules for
# validation statistics and plots, the ability to predict to covariate
# rasters other than those used to train the model, etc.
#
# OK, let's go....
# ~~~~~~~~~~~~
# clear the workspace
rm(list = ls())
# ~~~~~~~~~~~~
# define the modules for each module type
# ~~~~~~~~~~~~
# occurrence modules
# input: a numeric vector of length 4 giving the coordinates of the rectangular
# region within which to carry out the analysis, in the order: xmin, xmax,
# ymin, ymax.
# output: a dataframe with four columns:
# value - a numeric value which may give 1 for presences, 0 for absences or a
# positive integer for count data
# type - a character value saying what is in the value column
# lon - the longitude of the record
# lat - the latitude of the record
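#
# For example, a conforming presence-only output might look like this
# (records made up for illustration):
#
#   value     type  lon  lat
#       1 presence -1.5 52.2
#       1 presence  0.3 51.6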
# ~~~
# occurrenceCp:
# occurrence module to grab *Culex pipiens* (a mosquito) occurrence (i.e.
# presence-only) data from GBIF, in the area bounded by extent.
# Perhaps this should have temporal interval too for future-proofing?
occurrenceCp <- function (extent) {
  require(dismo)
  # query GBIF for Culex pipiens records within the extent
  raw <- gbif(genus = 'Culex',
              species = 'pipiens',
              ext = extent)
  # keep the coordinates and flag every record as a presence
  occurrence <- raw[, c('lon', 'lat')]
  occurrence$value <- 1
  occurrence$type <- 'presence'
  return(occurrence)
}
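# e.g. (commented out, since it queries GBIF over the network):
# cp <- occurrenceCp(c(-10, 10, 45, 65))
# head(cp)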
# ~~~~~~~~~~~~
# covariate modules
# input: a numeric vector of length 4 giving the coordinates of the rectangular
# region within which to carry out the analysis, in the order: xmin, xmax,
# ymin, ymax.
# output: a Raster* object (class from the raster package) with the gridded
# covariates used to train and predict from the SDM.
# ~~~
# covariateAir:
# covariate module to grab a coarse resolution mean air temperature raster
# from January-February of 2000-2001 for the given extent.
covariateAir <- function (extent) {
  require(RNCEP)
  require(raster)
  # download gridded air temperature at the 850mb pressure level
  c1 <- NCEP.gather(variable = 'air',
                    level = 850,
                    months.minmax = c(1:2),
                    years.minmax = c(2000, 2001),
                    lat.southnorth = extent[3:4],
                    lon.westeast = extent[1:2],
                    reanalysis2 = FALSE,
                    return.units = TRUE)
  # average over the time dimension and convert to a raster
  avg <- apply(c1, c(1, 2), mean)
  ras <- raster(avg)
  extent(ras) <- c(extent)
  return(ras)
}
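# e.g. (commented out, since it downloads NCEP reanalysis data over the
# network):
# air <- covariateAir(c(-10, 10, 45, 65))
# plot(air)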
# ~~~~~~~~~~~~
# process modules
# input:
# occ - occurrence data, the output from an occurrence module
# ras - covariate data, the output from a covariate module
# output: dataframe with at least 5 columns
# value - a numeric value which may give 1 for presences, 0 for absences or a
# positive integer for count data
# type - a character value saying what is in the value column
# lon - the longitude of the record
# lat - the latitude of the record
# columns 5-n - the values of the covariates for each record (the names of
# these columns should correspond exactly to the names of the layers in
# ras)
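#
# For example, with a single covariate layer named 'temp' (layer name and
# values made up for illustration), a conforming output might look like:
#
#   value       type  lon  lat temp
#       1   presence -1.5 52.2 11.3
#       0 background  2.1 48.9  9.7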
# ~~~
# processA:
# process module to generate up to 100 background records at random in
# cells of ras and return these along with the presence-only data.
processA <- function (occ, ras) {
  require(dismo)
  if (!all(occ$type == 'presence')) {
    stop ('this function only works for presence-only data')
  }
  # generate pseudo-absence data
  pa <- randomPoints(ras,
                     100)
  npres <- nrow(occ)
  npabs <- nrow(pa)
  # extract covariates
  occ_covs <- as.matrix(extract(ras, occ[, c('lon', 'lat')]))
  pa_covs <- as.matrix(extract(ras, pa))
  covs <- rbind(occ_covs,
                pa_covs)
  # combine with the occurrence data
  df <- data.frame(value = rep(c(1, 0),
                               c(npres, npabs)),
                   type = rep(c('presence', 'background'),
                              c(npres, npabs)),
                   lon = c(occ$lon, pa[, 1]),
                   lat = c(occ$lat, pa[, 2]),
                   covs)
  names(df)[5:ncol(df)] <- names(ras)
  return(df)
}
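# e.g., a quick offline check against a made-up single-layer raster (the
# layer name 'temp' and all values are invented; commented out so the test
# drive at the bottom stays the script's only live code):
# library(raster)
# ras_eg <- raster(matrix(rnorm(400), 20),
#                  xmn = -10, xmx = 10, ymn = 45, ymx = 65)
# names(ras_eg) <- 'temp'
# occ_eg <- data.frame(value = 1,
#                      type = 'presence',
#                      lon = runif(5, -10, 10),
#                      lat = runif(5, 45, 65))
# head(processA(occ_eg, ras_eg))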
# ~~~~~~~~~~~~
# model modules
# input:
# df - a dataframe, the output from a process module
# output: a model object with a valid predict method
# note that the current setup only works for models whose predict method
# accepts the argument type = 'response'; obviously we'll have to work
# around that.
# ~~~
# modelLR:
# model module to fit a simple logistic regression model
modelLR <- function (df) {
  if (!all(df$type %in% c('presence', 'absence', 'background'))) {
    stop ('only for presence/absence or presence/background data')
  }
  covs <- as.data.frame(df[, 5:ncol(df)])
  names(covs) <- names(df)[5:ncol(df)]
  m <- glm(df$value ~ .,
           data = covs,
           family = binomial)
  return(m)
}
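# e.g., continuing the synthetic ras_eg/occ_eg example above (commented out):
# df_eg <- processA(occ_eg, ras_eg)
# m_eg <- modelLR(df_eg)
# summary(m_eg)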
# ~~~
# modelRF:
# model module to fit a simple randomForest model (with a numeric 0/1
# response this fits a regression forest, so its predictions approximate
# probabilities)
modelRF <- function (df) {
  require(randomForest)
  if (!all(df$type %in% c('presence', 'absence', 'background'))) {
    stop ('only for presence/absence or presence/background data')
  }
  covs <- as.data.frame(df[, 5:ncol(df)])
  names(covs) <- names(df)[5:ncol(df)]
  # nodesize = 1 lets the trees grow to full depth
  m <- randomForest(df$value ~ .,
                    data = covs,
                    nodesize = 1)
  return(m)
}
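# e.g., with the same synthetic df_eg (commented out):
# m_rf_eg <- modelRF(df_eg)
# m_rf_eg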
# ~~~~~~~~~~~~
# map modules
# input:
# model - a model object, the output from a model module
# ras - a Raster* object, the output from a covariate module
# output: a Raster object giving the probabilistic model predictions for each
# cell of ras
# ~~~
# mapA:
# map module to predict from the fitted model onto every cell of the
# covariate raster
mapA <- function (model, ras) {
  # build a dataframe of covariate values, one row per cell
  vals <- data.frame(getValues(ras))
  colnames(vals) <- names(ras)
  pred <- predict(model,
                  newdata = vals,
                  type = 'response')
  # drop the predictions into a copy of the first covariate layer
  pred_ras <- ras[[1]]
  pred_ras <- setValues(pred_ras, pred)
  return(pred_ras)
}
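# e.g., mapping the synthetic logistic regression back onto its raster
# (commented out):
# pred_map_eg <- mapA(m_eg, ras_eg)
# plot(pred_map_eg, zlim = c(0, 1))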
# ~~~~~~~~~~~~
# workflow wrapper function
# input: an extent and one of each of the modules
# output: a list with the extent, the results of each module and a copy of the
# code used to execute the workflow (what's there now should be source-able,
# though I'm sure there is a much neater approach than the one I took; the
# ultimate aim is a much nicer way of enhancing reproducibility).
workflow <- function(ext,
                     occurrenceFn,
                     covariateFn,
                     processFn,
                     modelFn,
                     mapFn) {
  # small helper function to dump the code for a function to a text string
  getSource <- function (object) {
    paste(deparse(object), collapse = '\n')
  }
  # run each step in turn
  occ <- occurrenceFn(ext)
  cov <- covariateFn(ext)
  df <- processFn(occ, cov)
  m <- modelFn(df)
  map <- mapFn(m, cov)
  # get the command used to call this function
  bits <- sys.call()
  call <- paste0(bits[1],
                 '(',
                 paste(bits[-1],
                       collapse = ', '),
                 ')')
  # sorry about this spaghetti: paste together this function's source, each
  # module's source, the extent definition and the call itself
  workflow <- paste0(paste(bits[-2],
                           c(getSource(workflow),
                             getSource(occurrenceFn),
                             getSource(covariateFn),
                             getSource(processFn),
                             getSource(modelFn),
                             getSource(mapFn)),
                           sep = ' <- ',
                           collapse = '\n\n'),
                     paste0('\n\n',
                            bits[2],
                            ' <- c(',
                            paste(ext, collapse = ', '),
                            ')\n'),
                     '\nans <- ',
                     call,
                     collapse = '\n\n\n')
  # return ext (the numeric vector); bare 'extent' would pick up the
  # raster::extent function instead
  return(list(extent = ext,
              occ = occ,
              cov = cov,
              df = df,
              m = m,
              map = map,
              workflow = workflow))
}
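# the reproducibility trick above leans on two base R functions: deparse(),
# which recovers a function's source as text, and sys.call(), which recovers
# the call that invoked the current function. A minimal illustration, with a
# throwaway function f (commented out):
# f <- function (x) x + 1
# cat(paste(deparse(f), collapse = '\n'))
# g <- function (...) sys.call()
# g(1, 2)   # the unevaluated call: g(1, 2)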
# ~~~~~~~~~~~~~~
# test drive it!
# define the extent in lat and long
UKextent <- c(xmin = -10,
              xmax = 10,
              ymin = 45,
              ymax = 65)
# run a workflow, using the logistic regression model
ans1 <- workflow(ext = UKextent,
                 occurrenceFn = occurrenceCp,
                 covariateFn = covariateAir,
                 processFn = processA,
                 modelFn = modelLR,
                 mapFn = mapA)
# switch the model to a randomForest
ans2 <- workflow(ext = UKextent,
                 occurrenceFn = occurrenceCp,
                 covariateFn = covariateAir,
                 processFn = processA,
                 modelFn = modelRF,
                 mapFn = mapA)
# look at the contents of these lists
str(ans1, 1)
str(ans2, 1)
# plot the resulting maps
par(mfrow = c(1, 2))
plot(ans1$map,
     zlim = c(0, 1),
     main = 'LR')
points(ans1$occ[, 1:2],
       pch = 16,
       cex = 0.3)
plot(ans2$map,
     zlim = c(0, 1),
     main = 'RF')
points(ans2$occ[, 1:2],
       pch = 16,
       cex = 0.3)
# ~~~~~~~~~~~~~~~
# test-run the script it produced
# save the workflow script somewhere
write(ans1$workflow, file = '~/Desktop/ans1_code.txt')
# clear everything in the workspace
rm(list = ls())
# call the script!
source('~/Desktop/ans1_code.txt')
# plot the first map again
par(mfrow = c(1, 1))
plot(ans$map,
     zlim = c(0, 1),
     main = 'LR')
points(ans$occ[, 1:2],
       pch = 16,
       cex = 0.3)