SamBuckberry/countOverlaps.R

## countOverlaps.R
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("rtracklayer", "Rsamtools", "GenomicRanges"))

library(rtracklayer)
library(Rsamtools)
library(GenomicRanges)
library(GenomicAlignments)

#' SummarizeOverlapsByGene
#'
#' For counting the number of reads overlapping with feawtures in a TranscriptsDb
#'
#' @param bamFileDir Character string of directory where bam files and indexes are located. Bam files must have and index (.bai) file.
#' @param gtfFile Path to a gtf file output from cufflinks/cuffmerge
#' @param overlapMode Mode of counting reads. Must be "Union", "IntersectionStrict" or "IntersectionNotEmpty"
#' @param ignoreStrand Is strand orientation information to be retained? Logical.
#' @param sinlgeEnd Is the sequence data single-end? Logical. Default=FALSE
#' @param fragments logical value indicating if singletons, reads with unmapped pairs and other \
#' fragments should be included in counting. When fragments=FALSE readGAlignmentPairs is used to \
#' read in the data, when fragments=TRUE readGAlignmentsList is used.readGAlignmentPairs keeps only\
#'  the read pairs mated by the algorithm while readGAlignmentsList keeps the pairs as well as all singletons,\
#'   reads with unmapped pairs and other fragments. When fragments=TRUE counts will generally be higher\
#'    because all records are included in the counting, not just the primary alignment pairs.\
#'     See ?readGAlignmentsListFromBam for the algorithm details.
#' @return featureCounts A data frame with number of read counts for each gene

summarizeOverlapsByGene <- function(bamFiles, gtfFile, overlapMode="IntersectionNotEmpty",
                                    ignoreStrand=FALSE){

        # Import the gtf data and define the feature unit as gene
        gtf0 <- import(con=as.vector(gtfFile), asRangedData=FALSE)
        features <- split(x=gtf0, mcols(gtf0)$gene_id)

        message("GTF file imported")

        # Get a list of bam files from the bam directory
        fls <- bamFiles

        message("Summarizing Gene overlaps")

        #Sumarise overlaps by gene
        countData <- summarizeOverlaps(features=features, reads=fls,
                                       ignore.strand=TRUE, mode=overlapMode)

        #Used in testing ouptput
        head(assays(countData)$counts)

        # Get gene read counts into data frame
        countDataFrame <- data.frame(assays(countData)$counts)

        sampleNames <- lapply(fls, basename)

        colnames(countDataFrame) <- sampleNames
        row.names(countDataFrame) <- row.names(countData)

        featureCounts <- countDataFrame

        return(featureCounts)
}
	#source("http://bioconductor.org/biocLite.R")
	#biocLite(c("rtracklayer", "Rsamtools", "GenomicRanges"))

	library(rtracklayer)
	library(Rsamtools)
	library(GenomicRanges)
	library(GenomicAlignments)

	#' SummarizeOverlapsByGene
	#'
	#' For counting the number of reads overlapping with feawtures in a TranscriptsDb
	#'
	#' @param bamFileDir Character string of directory where bam files and indexes are located. Bam files must have and index (.bai) file.
	#' @param gtfFile Path to a gtf file output from cufflinks/cuffmerge
	#' @param overlapMode Mode of counting reads. Must be "Union", "IntersectionStrict" or "IntersectionNotEmpty"
	#' @param ignoreStrand Is strand orientation information to be retained? Logical.
	#' @param sinlgeEnd Is the sequence data single-end? Logical. Default=FALSE
	#' @param fragments logical value indicating if singletons, reads with unmapped pairs and other \
	#' fragments should be included in counting. When fragments=FALSE readGAlignmentPairs is used to \
	#' read in the data, when fragments=TRUE readGAlignmentsList is used.readGAlignmentPairs keeps only\
	#' the read pairs mated by the algorithm while readGAlignmentsList keeps the pairs as well as all singletons,\
	#' reads with unmapped pairs and other fragments. When fragments=TRUE counts will generally be higher\
	#' because all records are included in the counting, not just the primary alignment pairs.\
	#' See ?readGAlignmentsListFromBam for the algorithm details.
	#' @return featureCounts A data frame with number of read counts for each gene

	summarizeOverlapsByGene <- function(bamFiles, gtfFile, overlapMode="IntersectionNotEmpty",
	ignoreStrand=FALSE){

	# Import the gtf data and define the feature unit as gene
	gtf0 <- import(con=as.vector(gtfFile), asRangedData=FALSE)
	features <- split(x=gtf0, mcols(gtf0)$gene_id)

	message("GTF file imported")

	# Get a list of bam files from the bam directory
	fls <- bamFiles

	message("Summarizing Gene overlaps")

	#Sumarise overlaps by gene
	countData <- summarizeOverlaps(features=features, reads=fls,
	ignore.strand=TRUE, mode=overlapMode)

	#Used in testing ouptput
	head(assays(countData)$counts)

	# Get gene read counts into data frame
	countDataFrame <- data.frame(assays(countData)$counts)

	sampleNames <- lapply(fls, basename)

	colnames(countDataFrame) <- sampleNames
	row.names(countDataFrame) <- row.names(countData)

	featureCounts <- countDataFrame

	return(featureCounts)
	}