dmckean/parse_samstats.R

## parse_samstats.R
parse_samstats <- function(file) {

  max.col <- max(count.fields(file, sep = "\t"))

  stats.prefixes <- read.table(file, col.names = seq(max.col),
                          colClasses=c("character", rep("NULL", max.col-1)),
                          fill=TRUE)

  stats.nrows <- table(stats.prefixes)[unique(stats.prefixes)[,1]]
  stopifnot(unlist(
      mapply(function(n,x) rep(x, n), stats.nrows, names(stats.nrows))
    ) == stats.prefixes[,1])

  if(inherits(file, "connection")) {
    file.conn <- file
    open(file.conn, open="r")
  } else {
    file.conn <- file(file, open="r")
  }
  stats.list <- lapply(stats.nrows, function(x) {
    read.table(file.conn, nrows=x, sep="\t", stringsAsFactors = FALSE)[-1]
  })
  close(file.conn)

  within(stats.list, {

    # CHK, Checksum    [2]Read Names   [3]Sequences    [4]Qualities
    # CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
    CHK <- as.list(setNames(CHK, c("read_names", "sequences", "qualities")))

    # Summary Numbers:
    # raw total sequences
    # filtered sequences
    # sequences
    # is sorted
    # 1st fragments
    # last fragments
    # reads mapped
    # reads mapped and paired
    # reads unmapped
    # reads properly paired
    # reads paired
    # reads duplicated
    # reads MQ0
    # reads QC failed
    # non-primary alignments
    # total length
    # bases mapped
    # bases mapped (cigar)
    # bases trimmed
    # bases duplicated
    # mismatches
    # error rate
    # average length
    # maximum length
    # average quality
    # insert size average
    # insert size standard deviation
    # inward oriented pairs
    # outward oriented pairs
    # pairs with other orientation
    # pairs on different chromosomes
    SN <- as.list(setNames(SN[,2],
                           strtrim(SN[,1], nchar(SN[,1]) - 1)))
    # First Fragment Qualitites.
    # Columns correspond to qualities and rows to cycles. First column is the cycle number.
    dimnames(FFQ) <- list(FFQ[,1], c("cycle", seq(ncol(FFQ)-1)))

    # Last Fragment Qualitites.
    # Columns correspond to qualities and rows to cycles. First column is the cycle number.
    dimnames(LFQ) <- list(LFQ[,1], c("cycle", seq(ncol(LFQ)-1)))

    # GC Content of first fragments.
    names(GCF) <- c("GC", "count")

    # GC Content of last fragments.
    names(GCF) <- c("GC", "count")

    # ACGT content per cycle.
    # The columns are: cycle;
    #                  A,C,G,T base counts as a percentage of all A/C/G/T bases [%];
    #                  and N and O counts as a percentage of all A/C/G/T bases [%]
    names(GCC) <- c("cycle", "A", "C", "G", "T", "N", "O")

    # Insert sizes.
    # The columns are: insert size, pairs total, inward oriented pairs,
    #  outward oriented pairs, other pairs
    names(IS) <- c("insert_size", "total_pairs", "inward_pairs", "outward_pairs", "other_pairs")

    # Read lengths
    # The columns are: read length, count
    names(RL) <- c("read_length", "count")

    # Indel distribution
    # The columns are: length, number of insertions, number of deletions
    names(ID) <- c("length", "insertion_count", "deletion_count")

    # Indels per cycle
    # The columns are: cycle,
    #                  number of insertions (fwd)
    #                  number of insertions (rev)
    #                  number of deletions (fwd)
    #                  number of deletions (rev)
    names(IC) <- c("cycle", "ins_fwd", "ins_rev", "del_fwd", "del_rev")

    # Coverage distribution
    names(COV) <- c("coverage_range", "coverage", "bases")

    # GC-depth
    # The columns are: GC%, unique sequence percentiles,
    #                  10th, 25th, 50th, 75th and 90th depth percentile
    names(GCD) <- c("GC", "GC_percentile_for_mapped_sequences",
                    "gc.10", "gc.26", "gc.50", "gc.75", "gc.90")

  })

}
	parse_samstats <- function(file) {

	max.col <- max(count.fields(file, sep = "\t"))

	stats.prefixes <- read.table(file, col.names = seq(max.col),
	colClasses=c("character", rep("NULL", max.col-1)),
	fill=TRUE)

	stats.nrows <- table(stats.prefixes)[unique(stats.prefixes)[,1]]
	stopifnot(unlist(
	mapply(function(n,x) rep(x, n), stats.nrows, names(stats.nrows))
	) == stats.prefixes[,1])

	if(inherits(file, "connection")) {
	file.conn <- file
	open(file.conn, open="r")
	} else {
	file.conn <- file(file, open="r")
	}
	stats.list <- lapply(stats.nrows, function(x) {
	read.table(file.conn, nrows=x, sep="\t", stringsAsFactors = FALSE)[-1]
	})
	close(file.conn)

	within(stats.list, {

	# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities
	# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
	CHK <- as.list(setNames(CHK, c("read_names", "sequences", "qualities")))

	# Summary Numbers:
	# raw total sequences
	# filtered sequences
	# sequences
	# is sorted
	# 1st fragments
	# last fragments
	# reads mapped
	# reads mapped and paired
	# reads unmapped
	# reads properly paired
	# reads paired
	# reads duplicated
	# reads MQ0
	# reads QC failed
	# non-primary alignments
	# total length
	# bases mapped
	# bases mapped (cigar)
	# bases trimmed
	# bases duplicated
	# mismatches
	# error rate
	# average length
	# maximum length
	# average quality
	# insert size average
	# insert size standard deviation
	# inward oriented pairs
	# outward oriented pairs
	# pairs with other orientation
	# pairs on different chromosomes
	SN <- as.list(setNames(SN[,2],
	strtrim(SN[,1], nchar(SN[,1]) - 1)))
	# First Fragment Qualitites.
	# Columns correspond to qualities and rows to cycles. First column is the cycle number.
	dimnames(FFQ) <- list(FFQ[,1], c("cycle", seq(ncol(FFQ)-1)))

	# Last Fragment Qualitites.
	# Columns correspond to qualities and rows to cycles. First column is the cycle number.
	dimnames(LFQ) <- list(LFQ[,1], c("cycle", seq(ncol(LFQ)-1)))

	# GC Content of first fragments.
	names(GCF) <- c("GC", "count")

	# GC Content of last fragments.
	names(GCF) <- c("GC", "count")

	# ACGT content per cycle.
	# The columns are: cycle;
	# A,C,G,T base counts as a percentage of all A/C/G/T bases [%];
	# and N and O counts as a percentage of all A/C/G/T bases [%]
	names(GCC) <- c("cycle", "A", "C", "G", "T", "N", "O")

	# Insert sizes.
	# The columns are: insert size, pairs total, inward oriented pairs,
	# outward oriented pairs, other pairs
	names(IS) <- c("insert_size", "total_pairs", "inward_pairs", "outward_pairs", "other_pairs")

	# Read lengths
	# The columns are: read length, count
	names(RL) <- c("read_length", "count")

	# Indel distribution
	# The columns are: length, number of insertions, number of deletions
	names(ID) <- c("length", "insertion_count", "deletion_count")

	# Indels per cycle
	# The columns are: cycle,
	# number of insertions (fwd)
	# number of insertions (rev)
	# number of deletions (fwd)
	# number of deletions (rev)
	names(IC) <- c("cycle", "ins_fwd", "ins_rev", "del_fwd", "del_rev")

	# Coverage distribution
	names(COV) <- c("coverage_range", "coverage", "bases")

	# GC-depth
	# The columns are: GC%, unique sequence percentiles,
	# 10th, 25th, 50th, 75th and 90th depth percentile
	names(GCD) <- c("GC", "GC_percentile_for_mapped_sequences",
	"gc.10", "gc.26", "gc.50", "gc.75", "gc.90")

	})

	}