Created
July 2, 2013 15:31
-
-
Save anonymous/5910307 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(methods) | |
library(GenomicRanges) | |
library(rtracklayer) | |
setAs("GRanges", "data.table", function(from) { | |
if (length(from) == 0L) { | |
return(data.table()) | |
} | |
as.data.table(as.data.frame(from)) | |
}) | |
setAs("data.table", "GRanges", function(from) { | |
as(as.data.frame(from), "GRanges") | |
}) | |
setAs("data.frame", "GRanges", function(from) { | |
if (nrow(from) == 0L || all(is.na(from))) { | |
return(GRanges()) | |
} | |
if (!'seqnames' %in% colnames(from)) { | |
stop("seqnames required") | |
} | |
gr.meta.take <- colnames(from) %in% c('seqnames', 'strand') | |
gr.meta <- from[, gr.meta.take, drop=FALSE] | |
from <- from[, !gr.meta.take, drop=FALSE] | |
.ranges <- as(from, 'IRanges') | |
DF <- elementMetadata(.ranges) | |
elementMetadata(.ranges) <- NULL | |
.strand <- if ('strand' %in% colnames(gr.meta)) gr.meta$strand else '*' | |
gr <- GRanges(seqnames=gr.meta$seqnames, ranges=.ranges, strand=.strand) | |
mcols(gr) <- DF | |
gr | |
}) | |
gtf.add_p_id_tss_id<-function(input.gtf,output.gtf) { | |
## PURPOSE: Create a copy of <input.gtf>, a GTF formatted file, with | |
## additional attributes p_id and tss_id in column 9, writing result | |
## to <output.gtf>. These attributes are necessary for cuffdiff to | |
## perform all the differential splicing/coding/expression contrasts | |
## - if the attributes are missing, these contrasts are skipped. | |
## | |
## NOTE: cuffdiff's documented way of adding these attributes is to | |
## create a .combined.gtf file using `cuffcompare`, but this method | |
## unfortunately (unnecessarily!) resets the gene_id and | |
## transcript_id to newly generated unique values. | |
## | |
## TODO: consider naming the tss_id/p_id after the ENSG, such as | |
## ENSG<xxxx>:tss:<000n> | |
## | |
## AUTHOR: malcolm_cook@stowers.org | |
gtf.dt<-as(import(input.gtf,'gtf',asRangedData=FALSE),'data.table') | |
setkey(gtf.dt,gene_id,transcript_id,type,start) | |
gtf.dt[,transcript_tss:=ifelse('-' == strand,max(end),min(start)) | |
,by=list(gene_id,transcript_id)] | |
gtf.dt[,tss_id:=paste0('tss_',.GRP) | |
,by=list(gene_id,transcript_id,transcript_tss)] | |
gtf.dt[,cds_path:=.SD[type=='CDS',paste(start,end,sep='-',collapse='^')] | |
## which identifies the pre-image of a protein | |
,by=list(gene_id,transcript_id)] | |
gtf.dt[cds_path!='',p_id:=paste0('p_',.GRP) | |
,by=list(gene_id,transcript_id,cds_path)] | |
gtf.dt[,transcript_tss:=NULL] # So as not to be exported | |
# since it is longer needed. | |
gtf.dt[,cds_path:=NULL] # Likewise. | |
gtf.gr<-as(gtf.dt,'GRanges') | |
export(gtf.gr,output.gtf,'gtf') | |
gtf.gr | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment