Skip to content

Instantly share code, notes, and snippets.

@DarioS
Created May 25, 2011 00:41
Show Gist options
  • Save DarioS/990079 to your computer and use it in GitHub Desktop.
Save DarioS/990079 to your computer and use it in GitHub Desktop.
Count sequencing reads in exons and over junctions.
setGeneric("rnaCounts", function(alns, tx.db, ...) {standardGeneric("rnaCounts")})
setMethod("unique", "GRangesList",
function(x)
{
gr <- unname(unlist(x))
values(gr)$gene <- rep(names(x), elementLengths(x))
gr.df <- as.data.frame(gr)
gr.df <- gr.df[!duplicated(gr.df), ]
split(GRanges(gr.df$seqnames, IRanges(gr.df$start, gr.df$end), gr.df$strand),
factor(gr.df$gene, levels = names(x)))
})
# Get junctions from introns by transcripts.
.junctsByGene <- function(tx.db)
{
db.list <- as.list(tx.db)
tx.juncts <- intronsByTranscript(tx.db)
all.juncts <- unlist(tx.juncts)
names(all.juncts) <- values(all.juncts) <- NULL
tx_id <- rep(1:length(tx.juncts), elementLengths(tx.juncts))
gene_id <- factor(db.list$genes$gene_id[match(tx_id, db.list$genes$tx_id)],
unique(db.list$genes$gene_id))
unique(split(all.juncts, gene_id))
}
library(org.Hs.eg.db)
# alns is a list of GappedAlignments
setMethod("rnaCounts", c("list", "TranscriptDb"),
function(alns, tx.db, genes.symbols = as.list(org.Hs.egSYMBOL), verbose = TRUE)
{
if(verbose) message("Gathering exons for each gene.")
gene.exons <- exonsBy(tx.db, "gene")
exons.genes <- rep(names(gene.exons), elementLengths(gene.exons))
exons <- unlist(gene.exons)
values(exons) <- NULL
gene.exons <- split(exons, exons.genes)
if(verbose) message("Disjoining overlapping exons. This may take a few minutes.")
exons.disjoint <- isDisjoint(gene.exons)
gene.exons[!exons.disjoint] <- endoapply(gene.exons[!exons.disjoint], disjoin)
if(verbose) message("Determining junctions for each gene.")
gene.juncts <- .junctsByGene(tx.db)[names(gene.exons)]
if(verbose) message("Determining intronic regions for each gene.")
intronics <- psetdiff(gene.juncts, gene.exons)
# findOverlaps can't do 'within' overlaps when a chr is circular.
isCircular(seqinfo(gene.exons)) <- rep(FALSE, length(seqlevels(gene.exons)))
isCircular(seqinfo(gene.juncts)) <- rep(FALSE, length(seqlevels(gene.juncts)))
isCircular(seqinfo(intronics)) <- rep(FALSE, length(seqlevels(intronics)))
if(verbose) message("Finding alignment gaps.")
alns.gaps <- lapply(alns, function(x)
{
gaps.grl <- grglist(x[ngap(x) > 0])
alns.exons <- elementLengths(gaps.grl)
exons.sums <- cumsum(alns.exons)
index.offsets <- c(0, exons.sums[-length(exons.sums)])
left.exons <- unlist(mapply(function(x, y) y + (1:(x - 1)),
as.list(alns.exons), as.list(index.offsets),
SIMPLIFY = FALSE))
right.exons <- unlist(mapply(function(x, y) y + (2:x),
as.list(alns.exons), as.list(index.offsets),
SIMPLIFY = FALSE))
gaps.gr <- unlist(gaps.grl)
gaps.ranges <- ranges(gaps.gr)
GRanges(seqnames(gaps.gr[left.exons]),
pgap(gaps.ranges[left.exons], gaps.ranges[right.exons]),
strand(gaps.gr[left.exons]))
})
if(verbose) message("Counting in exons.")
ex.all <- unname(unlist(gene.exons))
values(ex.all) <- NULL
ex.counts <- lapply(alns, function(x)
table(factor(subjectHits(findOverlaps(granges(x), ex.all, type = "within")), 1:length(ex.all))))
ex.counts <- as.data.frame(do.call(cbind, ex.counts))
ex.counts$gene <- rep(names(gene.exons), elementLengths(gene.exons))
if(verbose) message("Counting across junctions.")
junct.all <- unname(unlist(gene.juncts))
junct.counts <- lapply(alns.gaps, function(x)
countOverlaps(junct.all, x, type = "equal"))
junct.counts <- as.data.frame(do.call(cbind, junct.counts))
junct.counts$gene <- rep(names(gene.juncts), elementLengths(gene.juncts))
if(verbose) message("Counting inside intronic regions.")
intr.all <- unname(unlist(intronics))
intr.counts <- lapply(alns, function(x)
table(factor(subjectHits(findOverlaps(granges(x), intr.all, type = "within")), 1:length(intr.all))))
intr.counts <- as.data.frame(do.call(cbind, intr.counts))
intr.counts$gene <- rep(names(intronics), elementLengths(intronics))
all.counts <- rbind(ex.counts, junct.counts, intr.counts)
if(!is.null(genes.symbols))
{
if(verbose) message("Annotating with gene symbols.")
genes.symbols <- genes.symbols[all.counts$gene]
all.counts$gene.symbol <- sapply(genes.symbols, function(x) ifelse(length(x) > 0, x[1], NA))
}
features.all <- c(ex.all, junct.all, intr.all)
values(features.all)$type <- rep(c("exon", "junction", "intronic"),
c(length(ex.all), length(junct.all), length(intr.all)))
values(features.all) <- cbind(values(features.all), DataFrame(all.counts))
meta.columns <- colnames(values(features.all))
metadata(features.all) <- list(totals = elementLengths(alns),
counts.cols = which(!meta.columns %in% c("type", "gene", "gene.symbol")))
features.all
})
@DarioS
Copy link
Author

DarioS commented Jul 20, 2011

20 July : The exon counts part only includes reads that fall wholly within an exon now.

@DarioS
Copy link
Author

DarioS commented Aug 23, 2011

Overlapping exons are partitioned. Total reads and column indexes of count data are stored in GRanges metadata, making less variables to be passed to isoDiff and to calcFPKMs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment