Last active
September 30, 2015 06:27
-
-
Save SamBuckberry/a77adf8b0c3b597d1f4c to your computer and use it in GitHub Desktop.
count the number of reads in a BAM file that overlap genomic features defined in GTF file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#source("http://bioconductor.org/biocLite.R") | |
#biocLite(c("rtracklayer", "Rsamtools", "GenomicRanges")) | |
library(rtracklayer) | |
library(Rsamtools) | |
library(GenomicRanges) | |
library(GenomicAlignments) | |
#' SummarizeOverlapsByGene | |
#' | |
#' For counting the number of reads overlapping with feawtures in a TranscriptsDb | |
#' | |
#' @param bamFileDir Character string of directory where bam files and indexes are located. Bam files must have and index (.bai) file. | |
#' @param gtfFile Path to a gtf file output from cufflinks/cuffmerge | |
#' @param overlapMode Mode of counting reads. Must be "Union", "IntersectionStrict" or "IntersectionNotEmpty" | |
#' @param ignoreStrand Is strand orientation information to be retained? Logical. | |
#' @param sinlgeEnd Is the sequence data single-end? Logical. Default=FALSE | |
#' @param fragments logical value indicating if singletons, reads with unmapped pairs and other \ | |
#' fragments should be included in counting. When fragments=FALSE readGAlignmentPairs is used to \ | |
#' read in the data, when fragments=TRUE readGAlignmentsList is used.readGAlignmentPairs keeps only\ | |
#' the read pairs mated by the algorithm while readGAlignmentsList keeps the pairs as well as all singletons,\ | |
#' reads with unmapped pairs and other fragments. When fragments=TRUE counts will generally be higher\ | |
#' because all records are included in the counting, not just the primary alignment pairs.\ | |
#' See ?readGAlignmentsListFromBam for the algorithm details. | |
#' @return featureCounts A data frame with number of read counts for each gene | |
summarizeOverlapsByGene <- function(bamFiles, gtfFile, overlapMode="IntersectionNotEmpty", | |
ignoreStrand=FALSE){ | |
# Import the gtf data and define the feature unit as gene | |
gtf0 <- import(con=as.vector(gtfFile), asRangedData=FALSE) | |
features <- split(x=gtf0, mcols(gtf0)$gene_id) | |
message("GTF file imported") | |
# Get a list of bam files from the bam directory | |
fls <- bamFiles | |
message("Summarizing Gene overlaps") | |
#Sumarise overlaps by gene | |
countData <- summarizeOverlaps(features=features, reads=fls, | |
ignore.strand=TRUE, mode=overlapMode) | |
#Used in testing ouptput | |
head(assays(countData)$counts) | |
# Get gene read counts into data frame | |
countDataFrame <- data.frame(assays(countData)$counts) | |
sampleNames <- lapply(fls, basename) | |
colnames(countDataFrame) <- sampleNames | |
row.names(countDataFrame) <- row.names(countData) | |
featureCounts <- countDataFrame | |
return(featureCounts) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment