Skip to content

Instantly share code, notes, and snippets.

@DarioS
Created May 23, 2011 01:40
Show Gist options
  • Save DarioS/986084 to your computer and use it in GitHub Desktop.
Save DarioS/986084 to your computer and use it in GitHub Desktop.
Convert TopHat GTF to Bioconductor TranscriptDB
GTF2TranscriptDB <- function(gtf.file, out.file = NULL, verbose = TRUE)
{
require(rtracklayer)
require(GenomicRanges)
require(GenomicFeatures)
min.info <- c("gene_id", "transcript_id", "exon_number")
if (verbose) message("Importing ", gtf.file)
gtf <- import.gff(gtf.file, asRangedData = FALSE)
#parse the per exon attributes
if (verbose) message("Parsing gene/transcript/exon ids.")
exon.info <- strsplit(gsub("\"|;", "", values(gtf)$group), split=" ")
exon.info <- lapply(exon.info, function(x) {
data <- x[seq(2, length(x), 2)]
names(data) <- x[seq(1, length(x), 2)]
data
})
attribs <- names(exon.info[[1]])
if (!all(min.info %in% attribs)) stop("Not all required attributes are in this GTF file.")
exon.info <- do.call(data.frame, c(lapply(attribs, function(x)
sapply(exon.info, "[", x)),
stringsAsFactors = FALSE))
colnames(exon.info) <- attribs
values(gtf) <- exon.info
if (verbose) message("Creating tables.")
#make transcripts table
exons.by.tx <- split(gtf, values(gtf)$transcript_id)
transcripts <- data.frame(
tx_id = 1:length(exons.by.tx),
tx_name = names(exons.by.tx),
tx_chrom = as.character(seqnames(unlist(exons.by.tx))[start(exons.by.tx@partitioning)]),
tx_strand = as.character(strand(unlist(exons.by.tx))[start(exons.by.tx@partitioning)]),
tx_start = IRanges::sapply(start(ranges(exons.by.tx)), min),
tx_end = IRanges::sapply(end(ranges(exons.by.tx)), max),
stringsAsFactors = FALSE)
#make exons table
exons.ord <- unlist(exons.by.tx)
splicings <- data.frame(
tx_id = rep(1:length(exons.by.tx), elementLengths(exons.by.tx)),
exon_rank = as.integer(values(exons.ord)$exon_number),
exon_chrom = as.character(seqnames(exons.ord)),
exon_strand = as.character(strand(exons.ord)),
exon_start = start(exons.ord),
exon_end = end(exons.ord),
stringsAsFactors = FALSE)
#make genes table
gene.txs <- tapply(values(gtf)$transcript_id, values(gtf)$gene_id, unique)
genes <- data.frame(
tx_name = unlist(gene.txs),
gene_id = rep(names(gene.txs), sapply(gene.txs, length)),
stringsAsFactors = FALSE)
#create the db
if (verbose) message("Creating TranscriptDb.")
gtf.db <- makeTranscriptDb(transcripts, splicings, genes)
if(!is.null(out.file))
{
if (verbose) message("Writing database to file.")
saveFeatures(gtf.db, out.file)
}
gtf.db
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment