Skip to content

Instantly share code, notes, and snippets.

Created July 2, 2013 15:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/5910307 to your computer and use it in GitHub Desktop.
Save anonymous/5910307 to your computer and use it in GitHub Desktop.
library(data.table)
library(methods)
library(GenomicRanges)
library(rtracklayer)
setAs("GRanges", "data.table", function(from) {
if (length(from) == 0L) {
return(data.table())
}
as.data.table(as.data.frame(from))
})
setAs("data.table", "GRanges", function(from) {
as(as.data.frame(from), "GRanges")
})
setAs("data.frame", "GRanges", function(from) {
if (nrow(from) == 0L || all(is.na(from))) {
return(GRanges())
}
if (!'seqnames' %in% colnames(from)) {
stop("seqnames required")
}
gr.meta.take <- colnames(from) %in% c('seqnames', 'strand')
gr.meta <- from[, gr.meta.take, drop=FALSE]
from <- from[, !gr.meta.take, drop=FALSE]
.ranges <- as(from, 'IRanges')
DF <- elementMetadata(.ranges)
elementMetadata(.ranges) <- NULL
.strand <- if ('strand' %in% colnames(gr.meta)) gr.meta$strand else '*'
gr <- GRanges(seqnames=gr.meta$seqnames, ranges=.ranges, strand=.strand)
mcols(gr) <- DF
gr
})
gtf.add_p_id_tss_id<-function(input.gtf,output.gtf) {
## PURPOSE: Create a copy of <input.gtf>, a GTF formatted file, with
## additional attributes p_id and tss_id in column 9, writing result
## to <output.gtf>. These attributes are necessary for cuffdiff to
## perform all the differential splicing/coding/expression contrasts
## - if the attributes are missing, these contrasts are skipped.
##
## NOTE: cuffdiff's documented way of adding these attributes is to
## create a .combined.gtf file using `cuffcompare`, but this method
## unfortunately (unnecessarily!) resets the gene_id and
## transcript_id to newly generated unique values.
##
## TODO: consider naming the tss_id/p_id after the ENSG, such as
## ENSG<xxxx>:tss:<000n>
##
## AUTHOR: malcolm_cook@stowers.org
gtf.dt<-as(import(input.gtf,'gtf',asRangedData=FALSE),'data.table')
setkey(gtf.dt,gene_id,transcript_id,type,start)
gtf.dt[,transcript_tss:=ifelse('-' == strand,max(end),min(start))
,by=list(gene_id,transcript_id)]
gtf.dt[,tss_id:=paste0('tss_',.GRP)
,by=list(gene_id,transcript_id,transcript_tss)]
gtf.dt[,cds_path:=.SD[type=='CDS',paste(start,end,sep='-',collapse='^')]
## which identifies the pre-image of a protein
,by=list(gene_id,transcript_id)]
gtf.dt[cds_path!='',p_id:=paste0('p_',.GRP)
,by=list(gene_id,transcript_id,cds_path)]
gtf.dt[,transcript_tss:=NULL] # So as not to be exported
# since it is longer needed.
gtf.dt[,cds_path:=NULL] # Likewise.
gtf.gr<-as(gtf.dt,'GRanges')
export(gtf.gr,output.gtf,'gtf')
gtf.gr
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment