Skip to content

Instantly share code, notes, and snippets.

@ATpoint
Created May 24, 2020 09:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ATpoint/129fb17a6c0ed07a4e63bdb943a6d37f to your computer and use it in GitHub Desktop.
Save ATpoint/129fb17a6c0ed07a4e63bdb943a6d37f to your computer and use it in GitHub Desktop.
## Script to create a transcriptome fasta file that contains both spliced and unspliced
## transcripts based on a GTF file from GENCODE.
## Also see the preprint https://doi.org/10.1101/2020.03.13.990069 from Charlotte Soneson on the matter of
## how preprocessing influences the outcome of velocity analysis.
## The below steps are inspired by the preprint and personal correspondence with CS.
ANALYSISDIR="./"
dir.create(paste0(ANALYSISDIR, "/Alevin_IDX"))
## eisaR: devtools::install_github("https://github.com/fmicompbio/eisaR", ref = "0de32247a0640336b73a92a52c8ad993c1012024")
library(eisaR)
library(seqinr)
library(Biostrings)
library(rtracklayer)
library(RCurl)
library(curl)
library(BSgenome.Mmusculus.UCSC.mm10)
################################################################################################################################
## Get the GTF from GENCODE (we use v23 to be consistent with our bulk RNA-seq data):
URL.GTF = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gtf.gz"
con = curl(URL.GTF, "r", new_handle(dirlistonly=TRUE))
## First we extract the spliced transcripts
mm10.tx <- extractTxSeqs(gtf = URL.GTF,
type = "spliced",
genome = BSgenome.Mmusculus.UCSC.mm10)
## now extract introns with eisaR::extractIntronSeqs.
## flanklength is the read length of the R2 read, so 98 based on the 10X recommendations
mm10.introns <- extractIntronSeqs(gtf = URL.GTF,
genome = BSgenome.Mmusculus.UCSC.mm10,
type = "collapse",
flanklength = 98 - 1)
## We remove the version info from the Ensembl transcript names and
## add the "_Intron" suffix to separate it from the spliced transcripts
names(mm10.tx) <- gsub("\\..*", "", names(mm10.tx))
names(mm10.introns) <- sapply(strsplit(names(mm10.introns), split = "-"), function(x){
return(paste( gsub("\\..*", "", x[1]), gsub("I", "Intron", x[2]), sep = "_" ))
})
## combine into one fasta file, write to disk, file is then ready for indexing with Salmon/Alevin
writeXStringSet(x = c(mm10.tx, mm10.introns), compress = FALSE,
filepath = paste0(ANALYSISDIR, "/Alevin_IDX/tx_withIntrons.fa"))
## Make a tx2gene map (for tximport) and
## a list of mitochondrial and rRNA gene
gtf <- import(con = URL.GTF, format = "GTF")
## select transcripts:
gtf.tx <- gtf[gtf$type == "transcript",]
## spliced transcripts. we make gene names of the form ENSMUSG_MGI,
## so a name where both the Ensembl gene id and the MGI gene name are included
spliced.tx2gene <- data.frame(Tx = gsub("\\..*", "", gtf.tx$transcript_id),
Gene = paste(gsub("\\..*", "", gtf.tx$gene_id), gtf.tx$gene_name, sep="_"),
stringsAsFactors = FALSE)
## unspliced transcripts:
unspliced.tx2gene <- data.frame(Tx = names(mm10.introns),
Gene = sapply(strsplit(names(mm10.introns), split = "_Intron"), function(x)paste0(x[1])))
## combine:
unspliced.tx2gene <- merge(x = unspliced.tx2gene,
y = data.frame(spliced.tx2gene,
Tmp = sapply(strsplit(spliced.tx2gene$Gene, split = "_"), function(x)x[1])),
by.x = "Gene", by.y = "Tmp")
## final tx2gene data.frame:
tx2gene <- rbind(spliced.tx2gene, data.frame(Tx = unspliced.tx2gene$Tx.x,
Gene = paste0(unspliced.tx2gene$Gene.y, "_unspliced")))
## to disk:
write.table(x = tx2gene, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE,
file = paste0(ANALYSISDIR, "/Alevin_IDX/tgmap.txt"))
## Get the mtGenes and rRNA:
tmp.mt <- gtf[seqnames(gtf) == "chrM",]
tmp.mt <- unique(paste(gsub("\\..*", "", tmp.mt$gene_id), tmp.mt$gene_name, sep="_"))
tmp.rR <- gtf[gtf$gene_type == "rRNA",]
tmp.rR <- unique(paste(gsub("\\..*", "", tmp.rR$gene_id), tmp.rR$gene_name, sep="_"))
## Save as RDS:
if(!dir.exists(paste0(ANALYSISDIR, "/RDS"))) dir.create(paste0(ANALYSISDIR, "/RDS"))
saveRDS(object = list(mt_Genes = tmp.mt,
rRNA_Genes = tmp.rR),
file = (paste0(ANALYSISDIR, "/RDS/mt_rRNA_Genes.rds")))
write.table(x = data.frame(mtGenes = tmp.mt),
sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
file = paste0(ANALYSISDIR, "/Alevin_IDX/mtGenes.txt"))
write.table(x = data.frame(rRNA = tmp.rR),
sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
file = paste0(ANALYSISDIR, "/Alevin_IDX/rrnaGenes.txt"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment