Skip to content

Instantly share code, notes, and snippets.

Created May 23, 2014 18:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/0cd0c926fb21a8d62fcc to your computer and use it in GitHub Desktop.
Save anonymous/0cd0c926fb21a8d62fcc to your computer and use it in GitHub Desktop.
annotatePeaks <- function(peaks, txdb, promoter.region=PromoterVariants()) {
if (is.numeric(promoter.region)) {
if (length(promoter.region) < 2)
promoter.region <- unname(rep(promoter.region, length.out=2))
promoter.region <- do.call(PromoterVariants, as.list(promoter.region))
}
tx <- transcripts(txdb)
names(tx) <- tx$tx_id
stopifnot(all(seqlevels(peaks) %in% seqlevels(txdb)))
txbygene <- transcriptsBy(txdb, "gene")
tx2entrez <- setNames(rep(names(txbygene), elementLengths(txbygene)),
unlist(txbygene)$tx_id)
tx$gene_id <- tx2entrez[names(tx)]
suppressWarnings(
subjectsByTx <- list(
cds = cdsBy(txdb, "tx"),
exon = exonsBy(txdb, "tx"),
intron = intronsByTranscript(txdb),
fiveUTR = fiveUTRsByTranscript(txdb),
threeUTR = threeUTRsByTranscript(txdb),
promoter = promoters(tx, upstream=upstream(promoter.region), downstream=downstream(promoter.region)),
tss = promoters(tx, upstream=0, downstream=1)))
subjectsByTx$noncoding <- subjectsByTx$exon[setdiff(names(subjectsByTx$exon), names(subjectsByTx$cds))]
subjectsByGene <- llply(subjectsByTx, function(subj) {
if (is(subj, "GRangesList")) {
split(unlist(subj), tx2entrez[rep(names(subj), elementLengths(subj))])
} else {
split(subj, tx2entrez[names(subj)])
}
}, .parallel=TRUE)
hitlists <- llply(subjectsByGene, function(subj) {
ol <- findOverlaps(peaks, reduce(subj), select="all")
CharacterList(split(names(subj)[subjectHits(ol)], names(peaks)[queryHits(ol)]))
}, .parallel=TRUE)
## Feature types earlier in the list have priority
feature.priority <- c("tss", "promoter", "fiveUTR", "cds", "threeUTR", "noncoding", "exon", "intron")
res <- DataFrame(
row.names=names(peaks), peak_id=names(peaks),
OverlapEntrez=CharacterList(NA), OverlapType=factor(NA, levels=feature.priority),
NearTSSEntrez=as.character(NA), DistanceToTSS=as.numeric(NA))
for (i in rev(feature.priority)) {
hitpeaks <- names(hitlists[[i]])
res[hitpeaks,"OverlapEntrez"] <- hitlists[[i]]
res[hitpeaks,"OverlapType"] <- i
}
tss.only.with.entrez <- unlist(unname(subjectsByGene$tss))
nearest.tss <- distanceToNearest(peaks, tss.only.with.entrez, select="arbitrary")
nearest.tss.txid <- names(tss.only.with.entrez)[subjectHits(nearest.tss)]
mcols(nearest.tss)$ENTREZ <- tx2entrez[nearest.tss.txid]
res[queryHits(nearest.tss),c("NearTSSEntrez", "DistanceToTSS")] <-
mcols(nearest.tss)[c("ENTREZ", "distance")]
mcols(peaks) <- res
peaks
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment