Skip to content

Instantly share code, notes, and snippets.

@jwbargsten
Created September 18, 2012 15:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwbargsten/3743732 to your computer and use it in GitHub Desktop.
Save jwbargsten/3743732 to your computer and use it in GitHub Desktop.
Create TranscriptDb objects from GFF3 files
## source: http://permalink.gmane.org/gmane.science.biology.informatics.conductor/40247
require(plyr)
require(rtracklayer)
require(GenomicRanges)
require(GenomicFeatures)
## determine the rank for all exons of a transcript
## the rank is reversed, if the strand is negative
.exon_rank_order <- function(exon.starts, strands) { order(exon.starts, decreasing=strands[1] == '-') }
makeTranscriptDbFromGFF3 <- function(file, allow.unknown.strand=TRUE, metadata=data.frame(name="genome", value="unknown_species")) {
data <- import(file, format="gff3", asRangedData=FALSE)
## omit everything with unknown strand
if(!allow.unknown.strand)
data <- data[which( strand(data) %in% c('+','-'))]
## get the corresponding subsets
data.mrna <- data[values(data)$type == 'mRNA']
data.exon <- data[values(data)$type == 'exon']
## build the transcripts data frame
transcripts <- data.frame(
tx_id=seq(along=data.mrna),
tx_name=as.character(values(data.mrna)$ID),
tx_chrom=as.character(seqnames(data.mrna)),
tx_strand=as.character(strand(data.mrna)),
tx_start=min(ranges(data.mrna)),
tx_end=max(ranges(data.mrna))
)
## build the basic structure for the splicings data frame
## take care of exons with more than one parent (**1**)
exon.parentList <- as.list(values(data.exon)$Parent)
exon.parentList.idxmap <- unlist(lapply(seq(along=exon.parentList), function(x) { rep(x, length(exon.parentList[[x]])) }))
exons.raw <- data.exon[exon.parentList.idxmap]
exon.tmp <- data.frame(
exon_name=as.character(values(exons.raw)$ID),
exon_strand=as.character(strand(exons.raw)),
exon_start=start(ranges(exons.raw)),
exon_end=end(ranges(exons.raw)),
exon_parent=unlist(exon.parentList),
stringsAsFactors=FALSE
)
## determine the corresponding transcript indices ...
exon.tmp <- transform(exon.tmp, tx_idx=match(exon_parent, transcripts$tx_name))
## ... and use them to assign the transcript ids and the strands
exon.tmp <- transform(exon.tmp, tx_id=transcripts$tx_id[tx_idx], tx_strand=transcripts$tx_strand[tx_idx])
## every exon rank is based on the start coord. and the strand of the corresponding transcript (the parent feature)
exon.tmp <- ddply(exon.tmp, .(exon_strand, tx_id), transform, exon_rank = .exon_rank_order(exon_start, tx_strand))
## extract the relevant columns for the splicings data frame
splicings <- exon.tmp[c("tx_id", "exon_rank", "exon_strand", "exon_start", "exon_end")]
## just bluntly assume that transcripts have not more than one parent (the gene), hence use "unlist()" directly
## and skip the stuff done in (**1**)
genes <- data.frame(tx_id=seq(along=data.mrna), gene_id=unlist(values(data.mrna)$Parent))
## extract chromosome names
chrom <- levels(seqnames(data))
db <- makeTranscriptDb(
transcripts,
splicings,
genes=genes,
metadata=metadata,
chrominfo=data.frame(chrom=chrom, length=rep(NA, length(chrom)))
)
db
}
## TEST IT
file <- system.file("tests", "genes.gff3",package="rtracklayer")
txdb <- makeTranscriptDbFromGFF3(file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment