jwbargsten/makeTranscriptDbFromGFF3.R

## makeTranscriptDbFromGFF3.R
## source: http://permalink.gmane.org/gmane.science.biology.informatics.conductor/40247
require(plyr)
require(rtracklayer)
require(GenomicRanges)
require(GenomicFeatures)

## determine the rank for all exons of a transcript
## the rank is reversed, if the strand is negative
.exon_rank_order <- function(exon.starts, strands) { order(exon.starts, decreasing=strands[1] == '-') }

makeTranscriptDbFromGFF3 <- function(file, allow.unknown.strand=TRUE, metadata=data.frame(name="genome", value="unknown_species")) {
    data <- import(file, format="gff3", asRangedData=FALSE)

    ## omit everything with unknown strand
    if(!allow.unknown.strand)
        data <- data[which( strand(data) %in% c('+','-'))]

    ## get the corresponding subsets
    data.mrna <- data[values(data)$type == 'mRNA']
    data.exon <- data[values(data)$type == 'exon']

    ## build the transcripts data frame
    transcripts <- data.frame(
                              tx_id=seq(along=data.mrna),
                              tx_name=as.character(values(data.mrna)$ID),
                              tx_chrom=as.character(seqnames(data.mrna)),
                              tx_strand=as.character(strand(data.mrna)),
                              tx_start=min(ranges(data.mrna)),
                              tx_end=max(ranges(data.mrna))
    )

    ## build the basic structure for the splicings data frame
    ## take care of exons with more than one parent (**1**)
    exon.parentList <- as.list(values(data.exon)$Parent)
    exon.parentList.idxmap <- unlist(lapply(seq(along=exon.parentList), function(x) { rep(x, length(exon.parentList[[x]])) }))

    exons.raw <- data.exon[exon.parentList.idxmap]
    exon.tmp <- data.frame(
        exon_name=as.character(values(exons.raw)$ID),
        exon_strand=as.character(strand(exons.raw)),
        exon_start=start(ranges(exons.raw)),
        exon_end=end(ranges(exons.raw)),
        exon_parent=unlist(exon.parentList),
        stringsAsFactors=FALSE
    )

    ## determine the corresponding transcript indices ...
    exon.tmp <- transform(exon.tmp, tx_idx=match(exon_parent, transcripts$tx_name))
    ## ... and use them to assign the transcript ids and the strands
    exon.tmp <- transform(exon.tmp, tx_id=transcripts$tx_id[tx_idx], tx_strand=transcripts$tx_strand[tx_idx])
    ## every exon rank is based on the start coord. and the strand of the corresponding transcript (the parent feature)
    exon.tmp <- ddply(exon.tmp, .(exon_strand, tx_id), transform, exon_rank = .exon_rank_order(exon_start, tx_strand))

    ## extract the relevant columns for the splicings data frame
    splicings <- exon.tmp[c("tx_id", "exon_rank", "exon_strand", "exon_start", "exon_end")]

    ## just bluntly assume that transcripts have not more than one parent (the gene), hence use "unlist()" directly
    ## and skip the stuff done in (**1**)
    genes <- data.frame(tx_id=seq(along=data.mrna), gene_id=unlist(values(data.mrna)$Parent))

    ## extract chromosome names
    chrom <- levels(seqnames(data))

    db <- makeTranscriptDb(
        transcripts,
        splicings,
        genes=genes,
        metadata=metadata,
        chrominfo=data.frame(chrom=chrom, length=rep(NA, length(chrom)))
    )

    db
}

## TEST IT
file <- system.file("tests", "genes.gff3",package="rtracklayer")
txdb <- makeTranscriptDbFromGFF3(file)
	## source: http://permalink.gmane.org/gmane.science.biology.informatics.conductor/40247
	require(plyr)
	require(rtracklayer)
	require(GenomicRanges)
	require(GenomicFeatures)

	## determine the rank for all exons of a transcript
	## the rank is reversed, if the strand is negative
	.exon_rank_order <- function(exon.starts, strands) { order(exon.starts, decreasing=strands[1] == '-') }

	makeTranscriptDbFromGFF3 <- function(file, allow.unknown.strand=TRUE, metadata=data.frame(name="genome", value="unknown_species")) {
	data <- import(file, format="gff3", asRangedData=FALSE)

	## omit everything with unknown strand
	if(!allow.unknown.strand)
	data <- data[which( strand(data) %in% c('+','-'))]

	## get the corresponding subsets
	data.mrna <- data[values(data)$type == 'mRNA']
	data.exon <- data[values(data)$type == 'exon']

	## build the transcripts data frame
	transcripts <- data.frame(
	tx_id=seq(along=data.mrna),
	tx_name=as.character(values(data.mrna)$ID),
	tx_chrom=as.character(seqnames(data.mrna)),
	tx_strand=as.character(strand(data.mrna)),
	tx_start=min(ranges(data.mrna)),
	tx_end=max(ranges(data.mrna))
	)

	## build the basic structure for the splicings data frame
	## take care of exons with more than one parent (1)
	exon.parentList <- as.list(values(data.exon)$Parent)
	exon.parentList.idxmap <- unlist(lapply(seq(along=exon.parentList), function(x) { rep(x, length(exon.parentList[[x]])) }))

	exons.raw <- data.exon[exon.parentList.idxmap]
	exon.tmp <- data.frame(
	exon_name=as.character(values(exons.raw)$ID),
	exon_strand=as.character(strand(exons.raw)),
	exon_start=start(ranges(exons.raw)),
	exon_end=end(ranges(exons.raw)),
	exon_parent=unlist(exon.parentList),
	stringsAsFactors=FALSE
	)

	## determine the corresponding transcript indices ...
	exon.tmp <- transform(exon.tmp, tx_idx=match(exon_parent, transcripts$tx_name))
	## ... and use them to assign the transcript ids and the strands
	exon.tmp <- transform(exon.tmp, tx_id=transcripts$tx_id[tx_idx], tx_strand=transcripts$tx_strand[tx_idx])
	## every exon rank is based on the start coord. and the strand of the corresponding transcript (the parent feature)
	exon.tmp <- ddply(exon.tmp, .(exon_strand, tx_id), transform, exon_rank = .exon_rank_order(exon_start, tx_strand))

	## extract the relevant columns for the splicings data frame
	splicings <- exon.tmp[c("tx_id", "exon_rank", "exon_strand", "exon_start", "exon_end")]

	## just bluntly assume that transcripts have not more than one parent (the gene), hence use "unlist()" directly
	## and skip the stuff done in (1)
	genes <- data.frame(tx_id=seq(along=data.mrna), gene_id=unlist(values(data.mrna)$Parent))

	## extract chromosome names
	chrom <- levels(seqnames(data))

	db <- makeTranscriptDb(
	transcripts,
	splicings,
	genes=genes,
	metadata=metadata,
	chrominfo=data.frame(chrom=chrom, length=rep(NA, length(chrom)))
	)

	db
	}

	## TEST IT
	file <- system.file("tests", "genes.gff3",package="rtracklayer")
	txdb <- makeTranscriptDbFromGFF3(file)