Skip to content

Instantly share code, notes, and snippets.

@astatham
Created August 29, 2011 09:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save astatham/1178072 to your computer and use it in GitHub Desktop.
Save astatham/1178072 to your computer and use it in GitHub Desktop.
cuff 2 db
cufflinks2TranscriptDB <- function(filename, verbose=TRUE) {
require(rtracklayer)
require(GenomicRanges)
require(GenomicFeatures)
requiredAttribs <- c("gene_id", "transcript_id", "exon_number")
if (verbose) message("Importing ", filename)
tmp <- import.gff(filename, asRangedData=FALSE)
#parse the per exon attributes
if (verbose) message("Parsing gene/transcript/exon ids")
tmpx <- strsplit(gsub(";", "", gsub("\"", "", values(tmp)$group)), split=" ")
tmpx <- lapply(tmpx, function(x) {
ans <- x[1:(length(x)/2)*2]
names(ans) <- x[1:(length(x)/2)*2-1]
ans
})
attribs <- unique(unlist(lapply(tmpx, names)))
if (!all(requiredAttribs %in% attribs)) stop("Not all required attributes are in this GFF file")
names(attribs) <- attribs
tmpx2 <- do.call(data.frame, c(lapply(attribs, function(x) sapply(tmpx, "[", x)), stringsAsFactors=FALSE))
values(tmp) <- tmpx2
if (verbose) message("Creating tables")
#make transcripts table
tmpT <- split(tmp, values(tmp)$transcript_id)
transcripts <- data.frame(
tx_id=1:length(tmpT),
tx_name=names(tmpT),
tx_chrom=as.character(seqnames(unlist(tmpT))[start(tmpT@partitioning)]),
tx_strand=as.character(strand(unlist(tmpT))[start(tmpT@partitioning)]),
tx_start=sapply(start(ranges(tmpT)), min),
tx_end=sapply(end(ranges(tmpT)), max),
stringsAsFactors=FALSE)
#make splicings table
splicings <- data.frame(
tx_id=rep(1:length(tmpT), elementLengths(tmpT)),
exon_rank=as.integer(values(unlist(tmpT))$exon_number),
exon_chrom=as.character(seqnames(unlist(tmpT))),
exon_strand=as.character(strand(unlist(tmpT))),
exon_start=start(unlist(tmpT)),
exon_end=end(unlist(tmpT)),
stringsAsFactors=FALSE)
#make genes table
gene_txs <- tapply(values(tmp)$transcript_id, values(tmp)$gene_id, unique)
genes <- data.frame(
tx_name=unlist(gene_txs),
gene_id=rep(names(gene_txs), sapply(gene_txs, length)),
stringsAsFactors=FALSE)
#create the db
if (verbose) message("Creating TranscriptDb")
tmpdb <- makeTranscriptDb(transcripts, splicings, genes=genes)
if (verbose) message("Use saveFeatures() to save the database to a file")
tmpdb
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment