Last active
July 12, 2016 13:40
-
-
Save boboppie/5466ea2ed8a2c54aa3d12e0a85dfcc6e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/Rscript | |
# Author: Fengyuan Hu | |
# Search ORFs from a given transcript sequences, covert local coordinates to genomic coordinates in BED format | |
# Input: | |
# Transcriptome (protein coding + lincRNAs) in Fasta format | |
# Transcriptome (protein coding + lincRNAs) annotation in GTF format | |
# Make sure annotation is consistent with sequences, or extract sequences from genome? | |
suppressPackageStartupMessages(library(riboSeqR)) | |
suppressPackageStartupMessages(library(GenomicFeatures)) | |
suppressPackageStartupMessages(library(rtracklayer)) | |
setwd("~/tmp") | |
# Find novel ORFs | |
# The findCDS function defines (from a fasta file of the transcriptome) all possible coding sequences with a start and stop codon in frame. | |
refTranscriptomeFile <- "~/ref/GENCODE/mouse/M8/fasta/transcriptome/CHR/proteincoding_and_lncRNA-transcript/pc_and_lncRNA_transcripts.fa" | |
allORFs.transcriptome <- findCDS(fastaFile = refTranscriptomeFile, startCodon = c("ATG", "CTG", "TTG", "GTG"), stopCodon = c("TAG", "TAA", "TGA")) | |
gtfFile <- "~/ref/GENCODE/mouse/M8/annotation/CHR/comprehensive/annotation.gtf" | |
mouse.txdb <- makeTxDbFromGFF(gtfFile, format = "gtf", organism = "Mus musculus") | |
###### TESTING ###### | |
## not run ## | |
#refTranscriptomeFile <- "~/ref/GENCODE/mouse/M8/fasta/transcriptome/CHR/proteincoding_and_lncRNA-transcript/test/Cxcr4.fa" | |
#allORFs.transcriptome <- findCDS(fastaFile = refTranscriptomeFile, startCodon = c("ATG"), stopCodon = c("TAG", "TAA", "TGA")) | |
#gtfFile <- "~/ref/GENCODE/mouse/M8/annotation/CHR/protein_coding/protein_coding.filtered.noSelenocysteine.chr1.gtf" | |
#mouse.txdb <- makeTxDbFromGFF(gtfFile, format = "gtf", organism = "Mus musculus") | |
###### TESTING! ###### | |
# seqlevels contains uniq seqnames | |
tx.names <- unlist(lapply(strsplit(seqlevels(allORFs.transcriptome), '[|]'), `[[`, 1)) | |
tx.exon <- exonsBy(mouse.txdb, by = "tx", use.names=T)[tx.names] | |
allORFs.transcriptome.orig <- allORFs.transcriptome | |
seqlevels(allORFs.transcriptome) <- unlist(lapply(strsplit(seqlevels(allORFs.transcriptome), '[|]'), `[[`, 1)) | |
orfToGenimic <- lapply(1:length(allORFs.transcriptome), function(i) { | |
orf <- allORFs.transcriptome[i] | |
tx.name <- runValue(seqnames(orf)) | |
tx.map <- pmapFromTranscripts(orf, tx.exon[names(tx.exon) == tx.name]) | |
names(tx.map) <- paste(tx.name, "ORF", start(orf), end(orf), mcols(orf)$frame, sep="_") | |
tx.map | |
}) | |
grl <- do.call(c, unlist(orfToGenimic, recursive=FALSE)) | |
unlisted <- unlist(grl) | |
unlisted.hit <- unlisted[which(mcols(unlisted)$hit == T)] | |
tx.map.hit.bed <- asBED(split(unlisted.hit, names(unlisted.hit))) | |
orfToBED <- export(tx.map.hit.bed, format = "bed") | |
write(orfToBED, "orfs.bed") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment