Skip to content

Instantly share code, notes, and snippets.

@tgirke
Last active February 21, 2018 04:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tgirke/57b32e2c86b4912a13c6bec60255333b to your computer and use it in GitHub Desktop.
Save tgirke/57b32e2c86b4912a13c6bec60255333b to your computer and use it in GitHub Desktop.
###################################################################
## Alignment Stats with Support for FASTQ Files with >500M Reads ##
###################################################################
alignStats <- function(args) {
fqpaths <- infile1(args)
bampaths <- outpaths(args)
bamexists <- file.exists(bampaths)
fqpaths <- fqpaths[bamexists]
bampaths <- bampaths[bamexists]
## Obtain total read number from FASTQ files
Nreads <- countLines(fqpaths)/4
names(Nreads) <- names(fqpaths)
## Solve integer overflow problem of countLines when fastq files have more than .Machine$integer.max lines
Nreads[sign(Nreads)==-1] <- NA
if(any(is.na(Nreads))) {
badNreads <- which(is.na(Nreads))
for(i in seq_along(badNreads)) {
srNreads <- 0
f <- ShortRead::FastqStreamer(fqpaths[badNreads[i]], 10^7)
while(length(fq <- yield(f))) {
srNreads <- srNreads + length(fq)
}
close(f)
Nreads[badNreads[i]] <- srNreads
}
}
## If reads are PE multiply by 2 as a rough approximation
if(nchar(infile2(args))[1] > 0) Nreads <- Nreads * 2
## Obtain total number of alignments from BAM files
bfl <- BamFileList(bampaths, yieldSize=50000, index=character())
param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE))
Nalign <- countBam(bfl, param=param)
## Obtain number of primary alignments from BAM files
param <- ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, isUnmappedQuery=FALSE))
Nalignprim <- countBam(bfl, param=param)
statsDF <- data.frame(FileName=names(Nreads),
Nreads=Nreads,
Nalign=Nalign$records,
Perc_Aligned=Nalign$records/Nreads*100,
Nalign_Primary=Nalignprim$records,
Perc_Aligned_Primary=Nalignprim$records/Nreads*100
)
if(nchar(infile2(args))[1] > 0) colnames(statsDF)[which(colnames(statsDF)=="Nreads")] <- "Nreads2x"
return(statsDF)
}
## Usage:
# source("https://gist.githubusercontent.com/tgirke/57b32e2c86b4912a13c6bec60255333b/raw/eda035cc519e29d982a575b1aed65f3179013ea5/alignStats.R")
# read_statsDF <- alignStats(args=args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment