Last active
September 8, 2023 11:21
-
-
Save gregorgorjanc/e7ca7a02ed59242573c12a890cf8c871 to your computer and use it in GitHub Desktop.
Simulate sequence reads from AlphaSimR population
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(AlphaSimR) | |
#' @rdname simulateSeqReads | |
#' @title Simulation of sequencing reads (alleles) for bi-allelic loci | |
#' | |
#' @description Simulation of sequencing reads (alleles) for bi-allelic loci | |
#' | |
#' @param geno matrix, genotypes coded as 0, 1, and 2 with individuals in rows | |
#' and loci in columns (see \code{\link{pullSegSiteGeno}}, \code{\link{pullSnpGeno}}, | |
#' and similar functions) | |
#' @param depth numeric, sequencing depth, average number of times a locus is | |
#' sequenced | |
#' @param error numeric, sequencing/alignment/calling error rate [0,1] | |
#' @param model character, PoissonGamma or Poisson | |
#' @param gamma numeric, shape parameter for gamma distribution in the | |
#' PoissonGamma model | |
#' @param pool logical, should the sequence reads from individuals be pooled; | |
#' note that sequencing \code{depth} is still per individuals and we return | |
#' only one "individual" | |
#' @amplify numeric, additional depth for specific loci provided as a vector | |
#' of length equal to the number of loci (see example) | |
#' | |
#' @return matrix with two rows per individual, the first (second) row contains | |
#' the number of sequence reads for the reference (alternative) allele | |
#' | |
#' @details Ideally we would simulate sequence reads and whole alignment and | |
#' calling process, but this would take a lot of time. Instead we simulate | |
#' alele reads directly. | |
#' | |
#' This simulation involves four processes that stack upon each other: | |
#' | |
#' 1) Locus-specific sequencability | |
#' | |
#' This mimcs the fact that some loci in the DNA generate more sequence reads | |
#' or are such reads more aligbnable to reference genome or other locus specific | |
#' stuff. We simulate this sequencability once for the genome and keep it | |
#' fixed across individuals. We use gamma distribution for this with shape = 4 | |
#' and rate = 1 / 4 following some publication I found back in the day from | |
#' the Abecasis lab (TODO CITE). | |
#' | |
#' 2) Number of locus reads for individual | |
#' | |
#' Since sequencing is a random process (we randomly amplify DNA fragments and | |
#' sequence the at random) we get different number of sequence reads per locus | |
#' per individual. We use Poisson distribution for this with mean equal to | |
#' sequencing depth, multiplied by sequencability so we get locus-specific | |
#' sequencing depth. | |
#' | |
#' 3) Allele reads for individual | |
#' | |
#' Depending on the actual genotype of individual we sequence reference or | |
#' alternative alleles. We conveniently sample this from binomial distribution | |
#' with probability for the alternative allele equaling genotype dosage / 2. | |
#' | |
#' 4) Error | |
#' | |
#' A sequence read can be wrong. We sample how many reads are wrong and then | |
#' flip allele read. We flip in both directions at random. | |
#' | |
#' @examples | |
#' founderPop <- quickHaplo(nInd = 3, nChr = 2, segSites = 10) | |
#' SP <- SimParam$new(founderPop) | |
#' pop <- newPop(founderPop) | |
#' | |
#' # True genotype | |
#' (geno <- pullSegSiteGeno(pop)) | |
#' | |
#' # High coverage sequencing (without error) | |
#' simulateSeqReads(geno, depth = 100, error = 0) | |
#' simulateSeqReads(geno, depth = 100, error = 0) # repeat to show the noisy outcome | |
#' | |
#' # High coverage sequencing (with high error) | |
#' simulateSeqReads(geno, depth = 100, error = 0.1) | |
#' simulateSeqReads(geno, depth = 100, error = 0.1) # repeat to show the noisy outcome | |
#' | |
#' # Low coverage sequencing | |
#' simulateSeqReads(geno, depth = 1) | |
#' simulateSeqReads(geno, depth = 1) # repeat to show the noisy outcome | |
#' | |
#' # Pool sequencing | |
#' simulateSeqReads(geno, depth = 30, pool = TRUE) | |
#' simulateSeqReads(geno, depth = 30, pool = TRUE) # repeat to show the noisy outcome | |
#' | |
#' # Low coverage sequencing with amplified loci (one locus per chromosome) | |
#' nLoci <- pop@nLoci[1] | |
#' nChr <- pop@nChr | |
#' amplicon <- rep(0, times = nLoci * nChr) | |
#' tmp <- sample(1:nLoci, size = nChr, replace = TRUE) | |
#' sites <- tmp + (0:(nChr - 1)) * nLoci | |
#' amplicon[sites] <- 30 | |
#' simulateSeqReads(geno, depth = 1, amplify = amplicon) | |
#' simulateSeqReads(geno, depth = 1, amplify = amplicon) # repeat to show the noisy outcome | |
#' | |
#' # Targeted high coverage sequencing (just few sites) | |
#' (targetGeno <- geno[, c(1, 5, 10, 11, 15, 20)]) # select targeted regions/loci | |
#' simulateSeqReads(targetGeno, depth = 100, error = 0) | |
#' @export | |
simulateSeqReads <- function(geno, depth, error = 0.01, | |
model = "PoissonGamma", gamma = 4, | |
pool = FALSE, amplify = 0) { | |
nLoci <- ncol(geno) | |
nInd <- nrow(geno) | |
if (any(geno > 2)) { | |
stop("Works only with diploid genotypes! Pathches are welcome!") | |
} | |
ret <- matrix(data = 0L, nrow = nInd * 2, ncol = nLoci) | |
colnames(ret) <- colnames(geno) | |
rownames(ret) <- c(t(outer(X = rownames(geno), Y = c(0, 1), FUN = paste, sep = "_"))) | |
if (pool) { | |
retPool <- ret[1:2, ] | |
rownames(retPool) <- c("pool_0", "pool_1") | |
} | |
hap1 <- 1 | |
hap2 <- hap1 + 1 | |
depth <- depth + amplify | |
if (model == "PoissonGamma") { | |
depth <- depth * rgamma(n = nLoci, shape = gamma, scale = 1 / gamma) | |
} | |
for (ind in 1:nInd) { | |
# ind <- 1 | |
nReads <- rpois(n = nLoci, lambda = depth) | |
altAllele <- rbinom(n = nLoci, size = nReads, prob = geno[ind, ] / 2) | |
refAllele <- nReads - altAllele | |
if (error > 0) { | |
eAlt <- rbinom(n = nLoci, size = altAllele, prob = error) | |
eRef <- rbinom(n = nLoci, size = refAllele, prob = error) | |
refAllele <- refAllele - eRef + eAlt | |
refAllele[refAllele < 0] <- 0 | |
altAllele <- altAllele - eAlt + eRef | |
altAllele[altAllele < 0] <- 0 | |
} | |
ret[hap1, ] <- refAllele | |
ret[hap2, ] <- altAllele | |
if (pool) { | |
retPool[1, ] <- retPool[1, ] + ret[hap1, ] | |
retPool[2, ] <- retPool[2, ] + ret[hap2, ] | |
} | |
hap1 <- hap1 + 2 | |
hap2 <- hap1 + 1 | |
} | |
if (pool){ | |
ret <- retPool | |
} | |
return(ret) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment