Skip to content

Instantly share code, notes, and snippets.

@dmckean
Created November 14, 2017 17:16
Show Gist options
  • Save dmckean/11c97c6f49e19f8dbe23b4bdf3463abd to your computer and use it in GitHub Desktop.
Save dmckean/11c97c6f49e19f8dbe23b4bdf3463abd to your computer and use it in GitHub Desktop.
Parses output from `samtools stats` into a list of lists and data.frames (good for samtools/htslib 1.4-1.6 at least)
parse_samstats <- function(file) {
max.col <- max(count.fields(file, sep = "\t"))
stats.prefixes <- read.table(file, col.names = seq(max.col),
colClasses=c("character", rep("NULL", max.col-1)),
fill=TRUE)
stats.nrows <- table(stats.prefixes)[unique(stats.prefixes)[,1]]
stopifnot(unlist(
mapply(function(n,x) rep(x, n), stats.nrows, names(stats.nrows))
) == stats.prefixes[,1])
if(inherits(file, "connection")) {
file.conn <- file
open(file.conn, open="r")
} else {
file.conn <- file(file, open="r")
}
stats.list <- lapply(stats.nrows, function(x) {
read.table(file.conn, nrows=x, sep="\t", stringsAsFactors = FALSE)[-1]
})
close(file.conn)
within(stats.list, {
# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
CHK <- as.list(setNames(CHK, c("read_names", "sequences", "qualities")))
# Summary Numbers:
# raw total sequences
# filtered sequences
# sequences
# is sorted
# 1st fragments
# last fragments
# reads mapped
# reads mapped and paired
# reads unmapped
# reads properly paired
# reads paired
# reads duplicated
# reads MQ0
# reads QC failed
# non-primary alignments
# total length
# bases mapped
# bases mapped (cigar)
# bases trimmed
# bases duplicated
# mismatches
# error rate
# average length
# maximum length
# average quality
# insert size average
# insert size standard deviation
# inward oriented pairs
# outward oriented pairs
# pairs with other orientation
# pairs on different chromosomes
SN <- as.list(setNames(SN[,2],
strtrim(SN[,1], nchar(SN[,1]) - 1)))
# First Fragment Qualitites.
# Columns correspond to qualities and rows to cycles. First column is the cycle number.
dimnames(FFQ) <- list(FFQ[,1], c("cycle", seq(ncol(FFQ)-1)))
# Last Fragment Qualitites.
# Columns correspond to qualities and rows to cycles. First column is the cycle number.
dimnames(LFQ) <- list(LFQ[,1], c("cycle", seq(ncol(LFQ)-1)))
# GC Content of first fragments.
names(GCF) <- c("GC", "count")
# GC Content of last fragments.
names(GCF) <- c("GC", "count")
# ACGT content per cycle.
# The columns are: cycle;
# A,C,G,T base counts as a percentage of all A/C/G/T bases [%];
# and N and O counts as a percentage of all A/C/G/T bases [%]
names(GCC) <- c("cycle", "A", "C", "G", "T", "N", "O")
# Insert sizes.
# The columns are: insert size, pairs total, inward oriented pairs,
# outward oriented pairs, other pairs
names(IS) <- c("insert_size", "total_pairs", "inward_pairs", "outward_pairs", "other_pairs")
# Read lengths
# The columns are: read length, count
names(RL) <- c("read_length", "count")
# Indel distribution
# The columns are: length, number of insertions, number of deletions
names(ID) <- c("length", "insertion_count", "deletion_count")
# Indels per cycle
# The columns are: cycle,
# number of insertions (fwd)
# number of insertions (rev)
# number of deletions (fwd)
# number of deletions (rev)
names(IC) <- c("cycle", "ins_fwd", "ins_rev", "del_fwd", "del_rev")
# Coverage distribution
names(COV) <- c("coverage_range", "coverage", "bases")
# GC-depth
# The columns are: GC%, unique sequence percentiles,
# 10th, 25th, 50th, 75th and 90th depth percentile
names(GCD) <- c("GC", "GC_percentile_for_mapped_sequences",
"gc.10", "gc.26", "gc.50", "gc.75", "gc.90")
})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment