gregorgorjanc/simulateSeqReads.R

## simulateSeqReads.R
library(AlphaSimR)

#' @rdname simulateSeqReads
#' @title Simulation of sequencing reads (alleles) for bi-allelic loci
#'
#' @description Simulation of sequencing reads (alleles) for bi-allelic loci
#'
#' @param geno matrix, genotypes coded as 0, 1, and 2 with individuals in rows
#'   and loci in columns (see \code{\link{pullSegSiteGeno}}, \code{\link{pullSnpGeno}},
#'   and similar functions)
#' @param depth numeric, sequencing depth, average number of times a locus is
#'   sequenced
#' @param error numeric, sequencing/alignment/calling error rate [0,1]
#' @param model character, PoissonGamma or Poisson
#' @param gamma numeric, shape parameter for gamma distribution in the
#'   PoissonGamma model
#' @param pool logical, should the sequence reads from individuals be pooled;
#'   note that sequencing \code{depth} is still per individuals and we return
#'   only one "individual"
#' @amplify numeric, additional depth for specific loci provided as a vector
#'   of length equal to the number of loci (see example)
#'
#' @return matrix with two rows per individual, the first (second) row contains
#'   the number of sequence reads for the reference (alternative) allele
#'
#' @details Ideally we would simulate sequence reads and whole alignment and
#'   calling process, but this would take a lot of time. Instead we simulate
#'   alele reads directly.
#'
#'   This simulation involves four processes that stack upon each other:
#'
#'   1) Locus-specific sequencability
#'
#'   This mimcs the fact that some loci in the DNA generate more sequence reads
#'   or are such reads more aligbnable to reference genome or other locus specific
#'   stuff. We simulate this sequencability once for the genome and keep it
#'   fixed across individuals. We use gamma distribution for this with shape = 4
#'   and rate = 1 / 4 following some publication I found back in the day from
#'   the Abecasis lab (TODO CITE).
#'
#'   2) Number of locus reads for individual
#'
#'   Since sequencing is a random process (we randomly amplify DNA fragments and
#'   sequence the at random) we get different number of sequence reads per locus
#'   per individual. We use Poisson distribution for this with mean equal to
#'   sequencing depth, multiplied by sequencability so we get locus-specific
#'   sequencing depth.
#'
#'   3) Allele reads for individual
#'
#'   Depending on the actual genotype of individual we sequence reference or
#'   alternative alleles. We conveniently sample this from binomial distribution
#'   with probability for the alternative allele equaling genotype dosage / 2.
#'
#'   4) Error
#'
#'   A sequence read can be wrong. We sample how many reads are wrong and then
#'   flip allele read. We flip in both directions at random.
#'
#' @examples
#' founderPop <- quickHaplo(nInd = 3, nChr = 2, segSites = 10)
#' SP <- SimParam$new(founderPop)
#' pop <- newPop(founderPop)
#'
#' # True genotype
#' (geno <- pullSegSiteGeno(pop))
#'
#' # High coverage sequencing (without error)
#' simulateSeqReads(geno, depth = 100, error = 0)
#' simulateSeqReads(geno, depth = 100, error = 0) # repeat to show the noisy outcome
#'
#' # High coverage sequencing (with high error)
#' simulateSeqReads(geno, depth = 100, error = 0.1)
#' simulateSeqReads(geno, depth = 100, error = 0.1) # repeat to show the noisy outcome
#'
#' # Low coverage sequencing
#' simulateSeqReads(geno, depth = 1)
#' simulateSeqReads(geno, depth = 1) # repeat to show the noisy outcome
#'
#' # Pool sequencing
#' simulateSeqReads(geno, depth = 30, pool = TRUE)
#' simulateSeqReads(geno, depth = 30, pool = TRUE)  # repeat to show the noisy outcome
#'
#' # Low coverage sequencing with amplified loci (one locus per chromosome)
#' nLoci <- pop@nLoci[1]
#' nChr <- pop@nChr
#' amplicon <- rep(0, times = nLoci * nChr)
#' tmp <- sample(1:nLoci, size = nChr, replace = TRUE)
#' sites <- tmp + (0:(nChr - 1)) * nLoci
#' amplicon[sites] <- 30
#' simulateSeqReads(geno, depth = 1, amplify = amplicon)
#' simulateSeqReads(geno, depth = 1, amplify = amplicon)  # repeat to show the noisy outcome
#'
#' # Targeted high coverage sequencing (just few sites)
#' (targetGeno <- geno[, c(1, 5, 10, 11, 15, 20)]) # select targeted regions/loci
#' simulateSeqReads(targetGeno, depth = 100, error = 0)

#' @export
simulateSeqReads <- function(geno, depth, error = 0.01,
                             model = "PoissonGamma", gamma = 4,
                             pool = FALSE, amplify = 0) {
  nLoci <- ncol(geno)
  nInd <- nrow(geno)
  if (any(geno > 2)) {
    stop("Works only with diploid genotypes! Pathches are welcome!")
  }
  ret <- matrix(data = 0L, nrow = nInd * 2, ncol = nLoci)
  colnames(ret) <- colnames(geno)
  rownames(ret) <- c(t(outer(X = rownames(geno), Y = c(0, 1), FUN = paste, sep = "_")))
  if (pool) {
    retPool <- ret[1:2, ]
    rownames(retPool) <- c("pool_0", "pool_1")
  }

  hap1 <- 1
  hap2 <- hap1 + 1
  depth <- depth + amplify
  if (model == "PoissonGamma") {
    depth <- depth * rgamma(n = nLoci, shape = gamma, scale = 1 / gamma)
  }
  for (ind in 1:nInd) {
    # ind <- 1
    nReads <- rpois(n = nLoci, lambda = depth)
    altAllele <- rbinom(n = nLoci, size = nReads, prob = geno[ind, ] / 2)
    refAllele <- nReads - altAllele
    if (error > 0) {
      eAlt <- rbinom(n = nLoci, size = altAllele, prob = error)
      eRef <- rbinom(n = nLoci, size = refAllele, prob = error)
      refAllele <- refAllele - eRef + eAlt
      refAllele[refAllele < 0] <- 0
      altAllele <- altAllele - eAlt + eRef
      altAllele[altAllele < 0] <- 0
    }
    ret[hap1, ] <- refAllele
    ret[hap2, ] <- altAllele
    if (pool) {
      retPool[1, ] <- retPool[1, ] + ret[hap1, ]
      retPool[2, ] <- retPool[2, ] + ret[hap2, ]
    }
    hap1 <- hap1 + 2
    hap2 <- hap1 + 1
  }
  if (pool){
    ret <- retPool
  }
  return(ret)
}
	library(AlphaSimR)

	#' @rdname simulateSeqReads
	#' @title Simulation of sequencing reads (alleles) for bi-allelic loci
	#'
	#' @description Simulation of sequencing reads (alleles) for bi-allelic loci
	#'
	#' @param geno matrix, genotypes coded as 0, 1, and 2 with individuals in rows
	#' and loci in columns (see \code{\link{pullSegSiteGeno}}, \code{\link{pullSnpGeno}},
	#' and similar functions)
	#' @param depth numeric, sequencing depth, average number of times a locus is
	#' sequenced
	#' @param error numeric, sequencing/alignment/calling error rate [0,1]
	#' @param model character, PoissonGamma or Poisson
	#' @param gamma numeric, shape parameter for gamma distribution in the
	#' PoissonGamma model
	#' @param pool logical, should the sequence reads from individuals be pooled;
	#' note that sequencing \code{depth} is still per individuals and we return
	#' only one "individual"
	#' @amplify numeric, additional depth for specific loci provided as a vector
	#' of length equal to the number of loci (see example)
	#'
	#' @return matrix with two rows per individual, the first (second) row contains
	#' the number of sequence reads for the reference (alternative) allele
	#'
	#' @details Ideally we would simulate sequence reads and whole alignment and
	#' calling process, but this would take a lot of time. Instead we simulate
	#' alele reads directly.
	#'
	#' This simulation involves four processes that stack upon each other:
	#'
	#' 1) Locus-specific sequencability
	#'
	#' This mimcs the fact that some loci in the DNA generate more sequence reads
	#' or are such reads more aligbnable to reference genome or other locus specific
	#' stuff. We simulate this sequencability once for the genome and keep it
	#' fixed across individuals. We use gamma distribution for this with shape = 4
	#' and rate = 1 / 4 following some publication I found back in the day from
	#' the Abecasis lab (TODO CITE).
	#'
	#' 2) Number of locus reads for individual
	#'
	#' Since sequencing is a random process (we randomly amplify DNA fragments and
	#' sequence the at random) we get different number of sequence reads per locus
	#' per individual. We use Poisson distribution for this with mean equal to
	#' sequencing depth, multiplied by sequencability so we get locus-specific
	#' sequencing depth.
	#'
	#' 3) Allele reads for individual
	#'
	#' Depending on the actual genotype of individual we sequence reference or
	#' alternative alleles. We conveniently sample this from binomial distribution
	#' with probability for the alternative allele equaling genotype dosage / 2.
	#'
	#' 4) Error
	#'
	#' A sequence read can be wrong. We sample how many reads are wrong and then
	#' flip allele read. We flip in both directions at random.
	#'
	#' @examples
	#' founderPop <- quickHaplo(nInd = 3, nChr = 2, segSites = 10)
	#' SP <- SimParam$new(founderPop)
	#' pop <- newPop(founderPop)
	#'
	#' # True genotype
	#' (geno <- pullSegSiteGeno(pop))
	#'
	#' # High coverage sequencing (without error)
	#' simulateSeqReads(geno, depth = 100, error = 0)
	#' simulateSeqReads(geno, depth = 100, error = 0) # repeat to show the noisy outcome
	#'
	#' # High coverage sequencing (with high error)
	#' simulateSeqReads(geno, depth = 100, error = 0.1)
	#' simulateSeqReads(geno, depth = 100, error = 0.1) # repeat to show the noisy outcome
	#'
	#' # Low coverage sequencing
	#' simulateSeqReads(geno, depth = 1)
	#' simulateSeqReads(geno, depth = 1) # repeat to show the noisy outcome
	#'
	#' # Pool sequencing
	#' simulateSeqReads(geno, depth = 30, pool = TRUE)
	#' simulateSeqReads(geno, depth = 30, pool = TRUE) # repeat to show the noisy outcome
	#'
	#' # Low coverage sequencing with amplified loci (one locus per chromosome)
	#' nLoci <- pop@nLoci[1]
	#' nChr <- pop@nChr
	#' amplicon <- rep(0, times = nLoci * nChr)
	#' tmp <- sample(1:nLoci, size = nChr, replace = TRUE)
	#' sites <- tmp + (0:(nChr - 1)) * nLoci
	#' amplicon[sites] <- 30
	#' simulateSeqReads(geno, depth = 1, amplify = amplicon)
	#' simulateSeqReads(geno, depth = 1, amplify = amplicon) # repeat to show the noisy outcome
	#'
	#' # Targeted high coverage sequencing (just few sites)
	#' (targetGeno <- geno[, c(1, 5, 10, 11, 15, 20)]) # select targeted regions/loci
	#' simulateSeqReads(targetGeno, depth = 100, error = 0)

	#' @export
	simulateSeqReads <- function(geno, depth, error = 0.01,
	model = "PoissonGamma", gamma = 4,
	pool = FALSE, amplify = 0) {
	nLoci <- ncol(geno)
	nInd <- nrow(geno)
	if (any(geno > 2)) {
	stop("Works only with diploid genotypes! Pathches are welcome!")
	}
	ret <- matrix(data = 0L, nrow = nInd * 2, ncol = nLoci)
	colnames(ret) <- colnames(geno)
	rownames(ret) <- c(t(outer(X = rownames(geno), Y = c(0, 1), FUN = paste, sep = "_")))
	if (pool) {
	retPool <- ret[1:2, ]
	rownames(retPool) <- c("pool_0", "pool_1")
	}

	hap1 <- 1
	hap2 <- hap1 + 1
	depth <- depth + amplify
	if (model == "PoissonGamma") {
	depth <- depth * rgamma(n = nLoci, shape = gamma, scale = 1 / gamma)
	}
	for (ind in 1:nInd) {
	# ind <- 1
	nReads <- rpois(n = nLoci, lambda = depth)
	altAllele <- rbinom(n = nLoci, size = nReads, prob = geno[ind, ] / 2)
	refAllele <- nReads - altAllele
	if (error > 0) {
	eAlt <- rbinom(n = nLoci, size = altAllele, prob = error)
	eRef <- rbinom(n = nLoci, size = refAllele, prob = error)
	refAllele <- refAllele - eRef + eAlt
	refAllele[refAllele < 0] <- 0
	altAllele <- altAllele - eAlt + eRef
	altAllele[altAllele < 0] <- 0
	}
	ret[hap1, ] <- refAllele
	ret[hap2, ] <- altAllele
	if (pool) {
	retPool[1, ] <- retPool[1, ] + ret[hap1, ]
	retPool[2, ] <- retPool[2, ] + ret[hap2, ]
	}
	hap1 <- hap1 + 2
	hap2 <- hap1 + 1
	}
	if (pool){
	ret <- retPool
	}
	return(ret)
	}