/annotatePeaks.R

## annotatePeaks.R
annotatePeaks <- function(peaks, txdb, promoter.region=PromoterVariants()) {
    if (is.numeric(promoter.region)) {
        if (length(promoter.region) < 2)
            promoter.region <- unname(rep(promoter.region, length.out=2))
        promoter.region <- do.call(PromoterVariants, as.list(promoter.region))
    }

    tx <- transcripts(txdb)
    names(tx) <- tx$tx_id
    stopifnot(all(seqlevels(peaks) %in% seqlevels(txdb)))

    txbygene <- transcriptsBy(txdb, "gene")
    tx2entrez <- setNames(rep(names(txbygene), elementLengths(txbygene)),
                          unlist(txbygene)$tx_id)
    tx$gene_id <- tx2entrez[names(tx)]
    suppressWarnings(
        subjectsByTx <- list(
            cds = cdsBy(txdb, "tx"),
            exon = exonsBy(txdb, "tx"),
            intron = intronsByTranscript(txdb),
            fiveUTR = fiveUTRsByTranscript(txdb),
            threeUTR = threeUTRsByTranscript(txdb),
            promoter = promoters(tx, upstream=upstream(promoter.region), downstream=downstream(promoter.region)),
            tss = promoters(tx, upstream=0, downstream=1)))
    subjectsByTx$noncoding <- subjectsByTx$exon[setdiff(names(subjectsByTx$exon), names(subjectsByTx$cds))]
    subjectsByGene <- llply(subjectsByTx, function(subj) {
        if (is(subj, "GRangesList")) {
            split(unlist(subj), tx2entrez[rep(names(subj), elementLengths(subj))])
        } else {
            split(subj, tx2entrez[names(subj)])
        }
    }, .parallel=TRUE)
    hitlists <- llply(subjectsByGene, function(subj) {
        ol <- findOverlaps(peaks, reduce(subj), select="all")
        CharacterList(split(names(subj)[subjectHits(ol)], names(peaks)[queryHits(ol)]))
    }, .parallel=TRUE)
    ## Feature types earlier in the list have priority
    feature.priority <- c("tss", "promoter", "fiveUTR", "cds", "threeUTR", "noncoding", "exon", "intron")
    res <- DataFrame(
        row.names=names(peaks), peak_id=names(peaks),
        OverlapEntrez=CharacterList(NA), OverlapType=factor(NA, levels=feature.priority),
        NearTSSEntrez=as.character(NA), DistanceToTSS=as.numeric(NA))
    for (i in rev(feature.priority)) {
        hitpeaks <- names(hitlists[[i]])
        res[hitpeaks,"OverlapEntrez"] <- hitlists[[i]]
        res[hitpeaks,"OverlapType"] <- i
    }

    tss.only.with.entrez <- unlist(unname(subjectsByGene$tss))
    nearest.tss <- distanceToNearest(peaks, tss.only.with.entrez, select="arbitrary")
    nearest.tss.txid <- names(tss.only.with.entrez)[subjectHits(nearest.tss)]
    mcols(nearest.tss)$ENTREZ <- tx2entrez[nearest.tss.txid]
    res[queryHits(nearest.tss),c("NearTSSEntrez", "DistanceToTSS")] <-
        mcols(nearest.tss)[c("ENTREZ", "distance")]
    mcols(peaks) <- res
    peaks
}
	annotatePeaks <- function(peaks, txdb, promoter.region=PromoterVariants()) {
	if (is.numeric(promoter.region)) {
	if (length(promoter.region) < 2)
	promoter.region <- unname(rep(promoter.region, length.out=2))
	promoter.region <- do.call(PromoterVariants, as.list(promoter.region))
	}

	tx <- transcripts(txdb)
	names(tx) <- tx$tx_id
	stopifnot(all(seqlevels(peaks) %in% seqlevels(txdb)))

	txbygene <- transcriptsBy(txdb, "gene")
	tx2entrez <- setNames(rep(names(txbygene), elementLengths(txbygene)),
	unlist(txbygene)$tx_id)
	tx$gene_id <- tx2entrez[names(tx)]
	suppressWarnings(
	subjectsByTx <- list(
	cds = cdsBy(txdb, "tx"),
	exon = exonsBy(txdb, "tx"),
	intron = intronsByTranscript(txdb),
	fiveUTR = fiveUTRsByTranscript(txdb),
	threeUTR = threeUTRsByTranscript(txdb),
	promoter = promoters(tx, upstream=upstream(promoter.region), downstream=downstream(promoter.region)),
	tss = promoters(tx, upstream=0, downstream=1)))
	subjectsByTx$noncoding <- subjectsByTx$exon[setdiff(names(subjectsByTx$exon), names(subjectsByTx$cds))]
	subjectsByGene <- llply(subjectsByTx, function(subj) {
	if (is(subj, "GRangesList")) {
	split(unlist(subj), tx2entrez[rep(names(subj), elementLengths(subj))])
	} else {
	split(subj, tx2entrez[names(subj)])
	}
	}, .parallel=TRUE)
	hitlists <- llply(subjectsByGene, function(subj) {
	ol <- findOverlaps(peaks, reduce(subj), select="all")
	CharacterList(split(names(subj)[subjectHits(ol)], names(peaks)[queryHits(ol)]))
	}, .parallel=TRUE)
	## Feature types earlier in the list have priority
	feature.priority <- c("tss", "promoter", "fiveUTR", "cds", "threeUTR", "noncoding", "exon", "intron")
	res <- DataFrame(
	row.names=names(peaks), peak_id=names(peaks),
	OverlapEntrez=CharacterList(NA), OverlapType=factor(NA, levels=feature.priority),
	NearTSSEntrez=as.character(NA), DistanceToTSS=as.numeric(NA))
	for (i in rev(feature.priority)) {
	hitpeaks <- names(hitlists[[i]])
	res[hitpeaks,"OverlapEntrez"] <- hitlists[[i]]
	res[hitpeaks,"OverlapType"] <- i
	}

	tss.only.with.entrez <- unlist(unname(subjectsByGene$tss))
	nearest.tss <- distanceToNearest(peaks, tss.only.with.entrez, select="arbitrary")
	nearest.tss.txid <- names(tss.only.with.entrez)[subjectHits(nearest.tss)]
	mcols(nearest.tss)$ENTREZ <- tx2entrez[nearest.tss.txid]
	res[queryHits(nearest.tss),c("NearTSSEntrez", "DistanceToTSS")] <-
	mcols(nearest.tss)[c("ENTREZ", "distance")]
	mcols(peaks) <- res
	peaks
	}