slavailn/get_longest_transcript.R

## get_longest_transcript.R
library("GenomicFeatures")
library("DESeq2")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")

setwd("<working_dir>")

# Assuming we are working with DESeq2 projects
# get all genes expressed in the project
load("DESeq2_object.RData")
counts <- counts(dds)
dim(counts) # 35087 genes
# Get ensembl ids for all of the genes expressed in the project
ids <- rownames(counts)

TxDb.Hsapiens.UCSC.hg19.knownGene
# Get all transcripts per genes using GenomicFeatures
transcriptsByGene <- transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene,
                                     by="gene")
# specify mart
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")
ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl)
# Get entrez ids for ensembl genes expressed in the project
attributes <- c("ensembl_gene_id", "entrezgene_id", "hgnc_symbol", "description")
annot <- getBM(attributes = attributes, values = ids, mart = ensembl)
annot <- annot[!duplicated(annot$ensembl_gene_id),]
# Remove records where entrez gene ids are unknown
annot <- annot[-which(is.na(annot$entrezgene_id)),]
dim(annot) # 22523 genes in total

# Extract corresponding genes from transcriptByGene GRanges list
transcriptsByGene <- subset(transcriptsByGene, names(transcriptsByGene) %in% as.character(annot$entrezgene_id))
# retained 19040 genes

# Keep only the longest (canonical) transcript
# Create named list where names are entrez ids and values are the names of the longest transcripts
longest_transcript <- list()
# iterate over the list (this takes a few minutes)
for (i in 1:length(names(transcriptsByGene))) {
  # get longest width
  largest_width <- max(width(transcriptsByGene[i]))
  largest_index <- ( which(width(transcriptsByGene[i]) == largest_width) )
  largest_index <- largest_index[[1]][1]
  largest_index <-  as.numeric(largest_index[[1]])
  gene <- names(transcriptsByGene[i])
  longest_transcript[[gene]] <- mcols(transcriptsByGene[[i]][largest_index])$tx_name
}

# convert to vector
longest_transcript <- unlist(longest_transcript)
length(longest_transcript)

# Extract transcript sequences
exonsByGene <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = "tx")
tx_ids <- names(exonsByGene)
all_transcript_ids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, keys=tx_ids, columns="TXNAME", keytype="TXID")
select_exon_sets <- subset(exonsByGene, all_transcript_ids$TXNAME %in% longest_transcript)
transcript_sequences <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, transcripts=select_exon_sets)
# change TX ids to TX names
select_transcript_ids <- all_transcript_ids[names(transcript_sequences),]
names(transcript_sequences) <- select_transcript_ids$TXNAME

# Write out longest sequences as fasta, the ids will UCSC transcript ids
writeXStringSet(transcript_sequences, filepath="longest_transcripts.fasta", format='fasta')
	library("GenomicFeatures")
	library("DESeq2")
	library("TxDb.Hsapiens.UCSC.hg19.knownGene")
	library("BSgenome.Hsapiens.UCSC.hg19")
	library("biomaRt")

	setwd("<working_dir>")

	# Assuming we are working with DESeq2 projects
	# get all genes expressed in the project
	load("DESeq2_object.RData")
	counts <- counts(dds)
	dim(counts) # 35087 genes
	# Get ensembl ids for all of the genes expressed in the project
	ids <- rownames(counts)

	TxDb.Hsapiens.UCSC.hg19.knownGene
	# Get all transcripts per genes using GenomicFeatures
	transcriptsByGene <- transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene,
	by="gene")
	# specify mart
	ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")
	ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl)
	# Get entrez ids for ensembl genes expressed in the project
	attributes <- c("ensembl_gene_id", "entrezgene_id", "hgnc_symbol", "description")
	annot <- getBM(attributes = attributes, values = ids, mart = ensembl)
	annot <- annot[!duplicated(annot$ensembl_gene_id),]
	# Remove records where entrez gene ids are unknown
	annot <- annot[-which(is.na(annot$entrezgene_id)),]
	dim(annot) # 22523 genes in total

	# Extract corresponding genes from transcriptByGene GRanges list
	transcriptsByGene <- subset(transcriptsByGene, names(transcriptsByGene) %in% as.character(annot$entrezgene_id))
	# retained 19040 genes

	# Keep only the longest (canonical) transcript
	# Create named list where names are entrez ids and values are the names of the longest transcripts
	longest_transcript <- list()
	# iterate over the list (this takes a few minutes)
	for (i in 1:length(names(transcriptsByGene))) {
	# get longest width
	largest_width <- max(width(transcriptsByGene[i]))
	largest_index <- ( which(width(transcriptsByGene[i]) == largest_width) )
	largest_index <- largest_index[[1]][1]
	largest_index <- as.numeric(largest_index[[1]])
	gene <- names(transcriptsByGene[i])
	longest_transcript[[gene]] <- mcols(transcriptsByGene[[i]][largest_index])$tx_name
	}

	# convert to vector
	longest_transcript <- unlist(longest_transcript)
	length(longest_transcript)

	# Extract transcript sequences
	exonsByGene <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = "tx")
	tx_ids <- names(exonsByGene)
	all_transcript_ids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, keys=tx_ids, columns="TXNAME", keytype="TXID")
	select_exon_sets <- subset(exonsByGene, all_transcript_ids$TXNAME %in% longest_transcript)
	transcript_sequences <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, transcripts=select_exon_sets)
	# change TX ids to TX names
	select_transcript_ids <- all_transcript_ids[names(transcript_sequences),]
	names(transcript_sequences) <- select_transcript_ids$TXNAME

	# Write out longest sequences as fasta, the ids will UCSC transcript ids
	writeXStringSet(transcript_sequences, filepath="longest_transcripts.fasta", format='fasta')