jslefche/README.md

## README.md

      
    Raw
  

              README.md
            
          
    covstop: Coverage-based stopping for biodiversity sampling

A function that takes a sample (rows)-by-species (columns) community matrix and uses coverage-based subsampling to identify the number of samples of individuals required to achieve a given level of coverage of total biodiversity.
From: Chao, Anne, and Lou Jost. "Coverage‐based rarefaction and extrapolation: standardizing samples by completeness rather than size." Ecology 93.12 (2012): 2533-2547.
Example

# generate fake community matrix
vec <- sample(0:100, 100, replace = T)

vec[sample(1:100, 50)] <- 0

mat <- matrix(vec, nrow = 10)

# determine number of samples required
covstop(mat, type = "samples")

# determine number of samples required for 99% coverage
covstop(mat, Cn = 0.98)

# determine number of individuals required
covstop(mat, type = "individuals")


## covstop.R
#' covstop: a function to implement coverage-based stopping
#'
#' @param comm a site (rows)-by-species (columns) data.frame
#' @param type whether the output should be in number of individuals ("individuals") or in
#' number of sampling units ("samples")
#' @param Cn desired level of coverage
#' @param se whether bootstrapped standard errors should be returned
#' @param knots a number of knots of computation, default is 40
#'
#' @return the number of individuals or samples required to meet the desired level of coverage
#'
#' @examples
#' comm <- sample(100)
#'
#' comm[sample(1:100, 50, replace = F)] <- 0
#'
#' comm <- matrix(comm, nrow = 10)
#'
#' covstop(comm)
#'
covstop <- function(comm, type = "samples", Cn = 0.90, se = FALSE, knots = 40) {

  if(Cn > 1) stop("Coverage level must be between (0, 1)")

  if(type == "samples") comm <- data.frame(N = nrow(comm), t(colSums(comm > 0)))

  comm <- split(comm, 1:nrow(comm))

  sc <- sapply(comm, function(Spec) {

    Spec <- as.numeric(Spec)

    endpoint <- 2 * sum(Spec)

    if(type == "individuals") {

      n <- sum(Spec)

      if (endpoint <= n) {

          m <- floor(seq(1, endpoint, length = floor(knots)))

        } else {

          m <- c(floor(seq(1, sum(Spec) - 1, length.out = floor(knots/2) - 1)), sum(Spec), floor(seq(sum(Spec) + 1, to = endpoint, length.out = floor(knots/2))))

          }

      m <- c(1, m[-1])

      C <- Chat.Ind(Spec, m)

      # if(se == TRUE) {
      #
      #   Prob.hat <- EstiBootComm.Ind(Spec)
      #
      #   Abun.Mat <- rmultinom(nboot, n, Prob.hat)
      #
      #   error.C <- qnorm(1 - (1 - 0.95) / 2) * apply(apply(Abun.Mat, 2, function(x) Chat.Ind(x, m)), 1, sd, na.rm = TRUE)
      #
      # }

    } else if(type == "samples") {

      nT <- Spec[1]

      if(endpoint <= nT) {

          m <- floor(seq(1, endpoint, length.out = floor(knots)))

          } else {

            m <- c(floor(seq(1, nT - 1, length.out = floor(knots/2) - 1)), nT, floor(seq(nT + 1, to = endpoint, length.out = floor(knots/2))))

            }

        m <- c(1, m[-1])

      C <- Chat.Sam(Spec, m)

    }


    m[which.min(C <= Cn)]

  } )

  # if(se == TRUE)
  #
  #   paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", as.numeric(sc[which.max(sc)]), "+/-", as.numeric(error.C))

  x <- as.numeric(sc[which.max(sc)])

  # print(paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", x))

  x

}

### helper functions from iNEXT
### credit: T.C. Hsieh
Chat.Ind <- function (x, m) {
  x <- x[x > 0]
  n <- sum(x)
  f1 <- sum(x == 1)
  f2 <- sum(x == 2)
  f0.hat <- ifelse(f2 == 0, (n - 1)/n * f1 * (f1 - 1)/2, (n -
                                                            1)/n * f1^2/2/f2)
  A <- ifelse(f1 > 0, n * f0.hat/(n * f0.hat + f1), 1)
  Sub <- function(m) {
    if (m < n) {
      xx <- x[(n - x) >= m]
      out <- 1 - sum(xx/n * exp(lgamma(n - xx + 1) - lgamma(n -
                                                              xx - m + 1) - lgamma(n) + lgamma(n - m)))
    }
    if (m == n)
      out <- 1 - f1/n * A
    if (m > n)
      out <- 1 - f1/n * A^(m - n + 1)
    out
  }
  sapply(m, Sub)
}

Chat.Sam <- function(x, t){
  nT <- x[1]
  y <- x[-1]
  y <- y[y>0]
  U <- sum(y)
  Q1 <- sum(y == 1)
  Q2 <- sum(y == 2)
  Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2)  #estimation of unseen species via Chao2
  A <- ifelse(Q1>0, nT*Q0.hat/(nT*Q0.hat+Q1), 1)
  Sub <- function(t){
    if(t < nT) {
      yy <- y[(nT-y)>=t]
      out <- 1 - sum(yy / U * exp(lgamma(nT-yy+1)-lgamma(nT-yy-t+1)-lgamma(nT)+lgamma(nT-t)))
    }
    #if(t < nT) out <- 1 - sum(y / U * exp(lchoose(nT - y, t) - lchoose(nT - 1, t)))
    if(t == nT) out <- 1 - Q1 / U * A
    if(t > nT) out <- 1 - Q1 / U * A^(t - nT + 1)
    out
  }
  sapply(t, Sub)
}

EstiBootComm.Ind <- function(Spec)
{
  Sobs <- sum(Spec > 0)   #observed species
  n <- sum(Spec)        #sample size
  f1 <- sum(Spec == 1)   #singleton
  f2 <- sum(Spec == 2)   #doubleton
  f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2)  #estimation of unseen species via Chao1
  A <- ifelse(f1>0, n*f0.hat/(n*f0.hat+f1), 1)
  a <- f1/n*A
  b <- sum(Spec / n * (1 - Spec / n) ^ n)
  if(f0.hat==0){
    w <- 0
    if(sum(Spec>0)==1){
      warning("This site has only one species. Estimation is not robust.")
    }
  }else{
    w <- a / b      	#adjusted factor for rare species in the sample
  }
  Prob.hat <- Spec / n * (1 - w * (1 - Spec / n) ^ n)					#estimation of relative abundance of observed species in the sample
  Prob.hat.Unse <- rep(a/ceiling(f0.hat), ceiling(f0.hat))  	#estimation of relative abundance of unseen species in the sample
  return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE))		  							#Output: a vector of estimated relative abundance
}

EstiBootComm.Sam <- function(Spec)
{
  nT <- Spec[1]
  Spec <- Spec[-1]
  Sobs <- sum(Spec > 0)   #observed species
  Q1 <- sum(Spec == 1) 	#singleton
  Q2 <- sum(Spec == 2) 	#doubleton
  Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2)	#estimation of unseen species via Chao2
  A <- ifelse(Q1>0, nT*Q0.hat/(nT*Q0.hat+Q1), 1)
  a <- Q1/nT*A
  b <- sum(Spec / nT * (1 - Spec / nT) ^ nT)

  if(Q0.hat==0){
    w <- 0
    if(sum(Spec>0)==1){
      warning("This site has only one species. Estimation is not robust.")
    }
  }else{
    w <- a / b      	#adjusted factor for rare species in the sample
  }

  Prob.hat <- Spec / nT * (1 - w * (1 - Spec / nT) ^ nT)					#estimation of detection probability of observed species in the sample
  Prob.hat.Unse <- rep(a/ceiling(Q0.hat), ceiling(Q0.hat))  	#estimation of detection probability of unseen species in the sample
  return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE))									#Output: a vector of estimated detection probability
}
	#' covstop: a function to implement coverage-based stopping
	#'
	#' @param comm a site (rows)-by-species (columns) data.frame
	#' @param type whether the output should be in number of individuals ("individuals") or in
	#' number of sampling units ("samples")
	#' @param Cn desired level of coverage
	#' @param se whether bootstrapped standard errors should be returned
	#' @param knots a number of knots of computation, default is 40
	#'
	#' @return the number of individuals or samples required to meet the desired level of coverage
	#'
	#' @examples
	#' comm <- sample(100)
	#'
	#' comm[sample(1:100, 50, replace = F)] <- 0
	#'
	#' comm <- matrix(comm, nrow = 10)
	#'
	#' covstop(comm)
	#'
	covstop <- function(comm, type = "samples", Cn = 0.90, se = FALSE, knots = 40) {

	if(Cn > 1) stop("Coverage level must be between (0, 1)")

	if(type == "samples") comm <- data.frame(N = nrow(comm), t(colSums(comm > 0)))

	comm <- split(comm, 1:nrow(comm))

	sc <- sapply(comm, function(Spec) {

	Spec <- as.numeric(Spec)

	endpoint <- 2 * sum(Spec)

	if(type == "individuals") {

	n <- sum(Spec)

	if (endpoint <= n) {

	m <- floor(seq(1, endpoint, length = floor(knots)))

	} else {

	m <- c(floor(seq(1, sum(Spec) - 1, length.out = floor(knots/2) - 1)), sum(Spec), floor(seq(sum(Spec) + 1, to = endpoint, length.out = floor(knots/2))))

	}

	m <- c(1, m[-1])

	C <- Chat.Ind(Spec, m)

	# if(se == TRUE) {
	#
	# Prob.hat <- EstiBootComm.Ind(Spec)
	#
	# Abun.Mat <- rmultinom(nboot, n, Prob.hat)
	#
	# error.C <- qnorm(1 - (1 - 0.95) / 2) * apply(apply(Abun.Mat, 2, function(x) Chat.Ind(x, m)), 1, sd, na.rm = TRUE)
	#
	# }

	} else if(type == "samples") {

	nT <- Spec[1]

	if(endpoint <= nT) {

	m <- floor(seq(1, endpoint, length.out = floor(knots)))

	} else {

	m <- c(floor(seq(1, nT - 1, length.out = floor(knots/2) - 1)), nT, floor(seq(nT + 1, to = endpoint, length.out = floor(knots/2))))

	}

	m <- c(1, m[-1])

	C <- Chat.Sam(Spec, m)

	}


	m[which.min(C <= Cn)]

	} )

	# if(se == TRUE)
	#
	# paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", as.numeric(sc[which.max(sc)]), "+/-", as.numeric(error.C))

	x <- as.numeric(sc[which.max(sc)])

	# print(paste0("Number of ", type, " needed to reach ", Cn*100, "% coverage: ", x))

	x

	}

	### helper functions from iNEXT
	### credit: T.C. Hsieh
	Chat.Ind <- function (x, m) {
	x <- x[x > 0]
	n <- sum(x)
	f1 <- sum(x == 1)
	f2 <- sum(x == 2)
	f0.hat <- ifelse(f2 == 0, (n - 1)/n * f1 * (f1 - 1)/2, (n -
	1)/n * f1^2/2/f2)
	A <- ifelse(f1 > 0, n * f0.hat/(n * f0.hat + f1), 1)
	Sub <- function(m) {
	if (m < n) {
	xx <- x[(n - x) >= m]
	out <- 1 - sum(xx/n * exp(lgamma(n - xx + 1) - lgamma(n -
	xx - m + 1) - lgamma(n) + lgamma(n - m)))
	}
	if (m == n)
	out <- 1 - f1/n * A
	if (m > n)
	out <- 1 - f1/n * A^(m - n + 1)
	out
	}
	sapply(m, Sub)
	}

	Chat.Sam <- function(x, t){
	nT <- x[1]
	y <- x[-1]
	y <- y[y>0]
	U <- sum(y)
	Q1 <- sum(y == 1)
	Q2 <- sum(y == 2)
	Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2) #estimation of unseen species via Chao2
	A <- ifelse(Q1>0, nTQ0.hat/(nTQ0.hat+Q1), 1)
	Sub <- function(t){
	if(t < nT) {
	yy <- y[(nT-y)>=t]
	out <- 1 - sum(yy / U * exp(lgamma(nT-yy+1)-lgamma(nT-yy-t+1)-lgamma(nT)+lgamma(nT-t)))
	}
	#if(t < nT) out <- 1 - sum(y / U * exp(lchoose(nT - y, t) - lchoose(nT - 1, t)))
	if(t == nT) out <- 1 - Q1 / U * A
	if(t > nT) out <- 1 - Q1 / U * A^(t - nT + 1)
	out
	}
	sapply(t, Sub)
	}

	EstiBootComm.Ind <- function(Spec)
	{
	Sobs <- sum(Spec > 0) #observed species
	n <- sum(Spec) #sample size
	f1 <- sum(Spec == 1) #singleton
	f2 <- sum(Spec == 2) #doubleton
	f0.hat <- ifelse(f2 == 0, (n - 1) / n * f1 * (f1 - 1) / 2, (n - 1) / n * f1 ^ 2/ 2 / f2) #estimation of unseen species via Chao1
	A <- ifelse(f1>0, nf0.hat/(nf0.hat+f1), 1)
	a <- f1/n*A
	b <- sum(Spec / n * (1 - Spec / n) ^ n)
	if(f0.hat==0){
	w <- 0
	if(sum(Spec>0)==1){
	warning("This site has only one species. Estimation is not robust.")
	}
	}else{
	w <- a / b #adjusted factor for rare species in the sample
	}
	Prob.hat <- Spec / n * (1 - w * (1 - Spec / n) ^ n) #estimation of relative abundance of observed species in the sample
	Prob.hat.Unse <- rep(a/ceiling(f0.hat), ceiling(f0.hat)) #estimation of relative abundance of unseen species in the sample
	return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE)) #Output: a vector of estimated relative abundance
	}

	EstiBootComm.Sam <- function(Spec)
	{
	nT <- Spec[1]
	Spec <- Spec[-1]
	Sobs <- sum(Spec > 0) #observed species
	Q1 <- sum(Spec == 1) #singleton
	Q2 <- sum(Spec == 2) #doubleton
	Q0.hat <- ifelse(Q2 == 0, (nT - 1) / nT * Q1 * (Q1 - 1) / 2, (nT - 1) / nT * Q1 ^ 2/ 2 / Q2) #estimation of unseen species via Chao2
	A <- ifelse(Q1>0, nTQ0.hat/(nTQ0.hat+Q1), 1)
	a <- Q1/nT*A
	b <- sum(Spec / nT * (1 - Spec / nT) ^ nT)

	if(Q0.hat==0){
	w <- 0
	if(sum(Spec>0)==1){
	warning("This site has only one species. Estimation is not robust.")
	}
	}else{
	w <- a / b #adjusted factor for rare species in the sample
	}

	Prob.hat <- Spec / nT * (1 - w * (1 - Spec / nT) ^ nT) #estimation of detection probability of observed species in the sample
	Prob.hat.Unse <- rep(a/ceiling(Q0.hat), ceiling(Q0.hat)) #estimation of detection probability of unseen species in the sample
	return(sort(c(Prob.hat, Prob.hat.Unse), decreasing=TRUE)) #Output: a vector of estimated detection probability
	}