ATpoint/Get_Intronic_Tx.R

## Get_Intronic_Tx.R
## Script to create a transcriptome fasta file that contains both spliced and unspliced
## transcripts based on a GTF file from GENCODE.
## Also see the preprint https://doi.org/10.1101/2020.03.13.990069 from Charlotte Soneson on the matter of
## how preprocessing influences the outcome of velocity analysis.
## The below steps are inspired by the preprint and personal correspondence with CS.

ANALYSISDIR="./"

dir.create(paste0(ANALYSISDIR, "/Alevin_IDX"))

## eisaR: devtools::install_github("https://github.com/fmicompbio/eisaR", ref = "0de32247a0640336b73a92a52c8ad993c1012024")

library(eisaR)
library(seqinr)
library(Biostrings)
library(rtracklayer)
library(RCurl)
library(curl)
library(BSgenome.Mmusculus.UCSC.mm10)

################################################################################################################################

## Get the GTF from GENCODE (we use v23 to be consistent with our bulk RNA-seq data):
URL.GTF   = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gtf.gz"
con = curl(URL.GTF, "r", new_handle(dirlistonly=TRUE))

## First we extract the spliced transcripts
mm10.tx <- extractTxSeqs(gtf    = URL.GTF,
                         type   = "spliced",
                         genome = BSgenome.Mmusculus.UCSC.mm10)

## now extract introns with eisaR::extractIntronSeqs.
## flanklength is the read length of the R2 read, so 98 based on the 10X recommendations
mm10.introns <- extractIntronSeqs(gtf = URL.GTF,
                                  genome = BSgenome.Mmusculus.UCSC.mm10,
                                  type = "collapse",
                                  flanklength = 98 - 1)

## We remove the version info from the Ensembl transcript names and
## add the "_Intron" suffix to separate it from the spliced transcripts
names(mm10.tx) <- gsub("\\..*", "", names(mm10.tx))
names(mm10.introns) <- sapply(strsplit(names(mm10.introns), split = "-"), function(x){
  return(paste( gsub("\\..*", "", x[1]), gsub("I", "Intron", x[2]), sep = "_"  ))
})

## combine into one fasta file, write to disk, file is then ready for indexing with Salmon/Alevin
writeXStringSet(x = c(mm10.tx, mm10.introns), compress = FALSE,
                filepath = paste0(ANALYSISDIR, "/Alevin_IDX/tx_withIntrons.fa"))

## Make a tx2gene map (for tximport) and
## a list of mitochondrial and rRNA gene
gtf <- import(con = URL.GTF, format = "GTF")

## select transcripts:
gtf.tx <- gtf[gtf$type == "transcript",]

## spliced transcripts. we make gene names of the form ENSMUSG_MGI,
## so a name where both the Ensembl gene id and the MGI gene name are included
spliced.tx2gene <- data.frame(Tx = gsub("\\..*", "", gtf.tx$transcript_id),
                              Gene = paste(gsub("\\..*", "", gtf.tx$gene_id), gtf.tx$gene_name, sep="_"),
                              stringsAsFactors = FALSE)

## unspliced transcripts:
unspliced.tx2gene <- data.frame(Tx = names(mm10.introns),
                                Gene = sapply(strsplit(names(mm10.introns), split = "_Intron"), function(x)paste0(x[1])))

## combine:
unspliced.tx2gene <- merge(x = unspliced.tx2gene,
                           y = data.frame(spliced.tx2gene,
                                          Tmp = sapply(strsplit(spliced.tx2gene$Gene, split = "_"), function(x)x[1])),
                           by.x = "Gene", by.y = "Tmp")

## final tx2gene data.frame:
tx2gene <- rbind(spliced.tx2gene, data.frame(Tx = unspliced.tx2gene$Tx.x,
                                             Gene = paste0(unspliced.tx2gene$Gene.y, "_unspliced")))

## to disk:
write.table(x = tx2gene, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE,
            file = paste0(ANALYSISDIR, "/Alevin_IDX/tgmap.txt"))

## Get the mtGenes and rRNA:
tmp.mt <- gtf[seqnames(gtf) == "chrM",]
tmp.mt <- unique(paste(gsub("\\..*", "", tmp.mt$gene_id), tmp.mt$gene_name, sep="_"))

tmp.rR <- gtf[gtf$gene_type == "rRNA",]
tmp.rR <- unique(paste(gsub("\\..*", "", tmp.rR$gene_id), tmp.rR$gene_name, sep="_"))

## Save as RDS:
if(!dir.exists(paste0(ANALYSISDIR, "/RDS"))) dir.create(paste0(ANALYSISDIR, "/RDS"))
saveRDS(object = list(mt_Genes = tmp.mt,
                      rRNA_Genes = tmp.rR),
        file = (paste0(ANALYSISDIR, "/RDS/mt_rRNA_Genes.rds")))

write.table(x = data.frame(mtGenes = tmp.mt),
            sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
            file = paste0(ANALYSISDIR, "/Alevin_IDX/mtGenes.txt"))

write.table(x = data.frame(rRNA = tmp.rR),
            sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
            file = paste0(ANALYSISDIR, "/Alevin_IDX/rrnaGenes.txt"))
	## Script to create a transcriptome fasta file that contains both spliced and unspliced
	## transcripts based on a GTF file from GENCODE.
	## Also see the preprint https://doi.org/10.1101/2020.03.13.990069 from Charlotte Soneson on the matter of
	## how preprocessing influences the outcome of velocity analysis.
	## The below steps are inspired by the preprint and personal correspondence with CS.

	ANALYSISDIR="./"

	dir.create(paste0(ANALYSISDIR, "/Alevin_IDX"))

	## eisaR: devtools::install_github("https://github.com/fmicompbio/eisaR", ref = "0de32247a0640336b73a92a52c8ad993c1012024")

	library(eisaR)
	library(seqinr)
	library(Biostrings)
	library(rtracklayer)
	library(RCurl)
	library(curl)
	library(BSgenome.Mmusculus.UCSC.mm10)

	################################################################################################################################

	## Get the GTF from GENCODE (we use v23 to be consistent with our bulk RNA-seq data):
	URL.GTF = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gtf.gz"
	con = curl(URL.GTF, "r", new_handle(dirlistonly=TRUE))

	## First we extract the spliced transcripts
	mm10.tx <- extractTxSeqs(gtf = URL.GTF,
	type = "spliced",
	genome = BSgenome.Mmusculus.UCSC.mm10)

	## now extract introns with eisaR::extractIntronSeqs.
	## flanklength is the read length of the R2 read, so 98 based on the 10X recommendations
	mm10.introns <- extractIntronSeqs(gtf = URL.GTF,
	genome = BSgenome.Mmusculus.UCSC.mm10,
	type = "collapse",
	flanklength = 98 - 1)

	## We remove the version info from the Ensembl transcript names and
	## add the "_Intron" suffix to separate it from the spliced transcripts
	names(mm10.tx) <- gsub("\\..*", "", names(mm10.tx))
	names(mm10.introns) <- sapply(strsplit(names(mm10.introns), split = "-"), function(x){
	return(paste( gsub("\\..*", "", x[1]), gsub("I", "Intron", x[2]), sep = "_" ))
	})

	## combine into one fasta file, write to disk, file is then ready for indexing with Salmon/Alevin
	writeXStringSet(x = c(mm10.tx, mm10.introns), compress = FALSE,
	filepath = paste0(ANALYSISDIR, "/Alevin_IDX/tx_withIntrons.fa"))

	## Make a tx2gene map (for tximport) and
	## a list of mitochondrial and rRNA gene
	gtf <- import(con = URL.GTF, format = "GTF")

	## select transcripts:
	gtf.tx <- gtf[gtf$type == "transcript",]

	## spliced transcripts. we make gene names of the form ENSMUSG_MGI,
	## so a name where both the Ensembl gene id and the MGI gene name are included
	spliced.tx2gene <- data.frame(Tx = gsub("\\..*", "", gtf.tx$transcript_id),
	Gene = paste(gsub("\\..*", "", gtf.tx$gene_id), gtf.tx$gene_name, sep="_"),
	stringsAsFactors = FALSE)

	## unspliced transcripts:
	unspliced.tx2gene <- data.frame(Tx = names(mm10.introns),
	Gene = sapply(strsplit(names(mm10.introns), split = "_Intron"), function(x)paste0(x[1])))

	## combine:
	unspliced.tx2gene <- merge(x = unspliced.tx2gene,
	y = data.frame(spliced.tx2gene,
	Tmp = sapply(strsplit(spliced.tx2gene$Gene, split = "_"), function(x)x[1])),
	by.x = "Gene", by.y = "Tmp")

	## final tx2gene data.frame:
	tx2gene <- rbind(spliced.tx2gene, data.frame(Tx = unspliced.tx2gene$Tx.x,
	Gene = paste0(unspliced.tx2gene$Gene.y, "_unspliced")))

	## to disk:
	write.table(x = tx2gene, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE,
	file = paste0(ANALYSISDIR, "/Alevin_IDX/tgmap.txt"))

	## Get the mtGenes and rRNA:
	tmp.mt <- gtf[seqnames(gtf) == "chrM",]
	tmp.mt <- unique(paste(gsub("\\..*", "", tmp.mt$gene_id), tmp.mt$gene_name, sep="_"))

	tmp.rR <- gtf[gtf$gene_type == "rRNA",]
	tmp.rR <- unique(paste(gsub("\\..*", "", tmp.rR$gene_id), tmp.rR$gene_name, sep="_"))

	## Save as RDS:
	if(!dir.exists(paste0(ANALYSISDIR, "/RDS"))) dir.create(paste0(ANALYSISDIR, "/RDS"))
	saveRDS(object = list(mt_Genes = tmp.mt,
	rRNA_Genes = tmp.rR),
	file = (paste0(ANALYSISDIR, "/RDS/mt_rRNA_Genes.rds")))

	write.table(x = data.frame(mtGenes = tmp.mt),
	sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
	file = paste0(ANALYSISDIR, "/Alevin_IDX/mtGenes.txt"))

	write.table(x = data.frame(rRNA = tmp.rR),
	sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE,
	file = paste0(ANALYSISDIR, "/Alevin_IDX/rrnaGenes.txt"))