Skip to content

Instantly share code, notes, and snippets.

@slavailn
Created March 2, 2020 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slavailn/5df79cf89754c0a37b4255a1aa35f5d3 to your computer and use it in GitHub Desktop.
Save slavailn/5df79cf89754c0a37b4255a1aa35f5d3 to your computer and use it in GitHub Desktop.
Get longest transcript for a set of select genes and write them out as fasta
library("GenomicFeatures")
library("DESeq2")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")
setwd("<working_dir>")
# Assuming we are working with DESeq2 projects
# get all genes expressed in the project
load("DESeq2_object.RData")
counts <- counts(dds)
dim(counts) # 35087 genes
# Get ensembl ids for all of the genes expressed in the project
ids <- rownames(counts)
TxDb.Hsapiens.UCSC.hg19.knownGene
# Get all transcripts per genes using GenomicFeatures
transcriptsByGene <- transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene,
by="gene")
# specify mart
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")
ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl)
# Get entrez ids for ensembl genes expressed in the project
attributes <- c("ensembl_gene_id", "entrezgene_id", "hgnc_symbol", "description")
annot <- getBM(attributes = attributes, values = ids, mart = ensembl)
annot <- annot[!duplicated(annot$ensembl_gene_id),]
# Remove records where entrez gene ids are unknown
annot <- annot[-which(is.na(annot$entrezgene_id)),]
dim(annot) # 22523 genes in total
# Extract corresponding genes from transcriptByGene GRanges list
transcriptsByGene <- subset(transcriptsByGene, names(transcriptsByGene) %in% as.character(annot$entrezgene_id))
# retained 19040 genes
# Keep only the longest (canonical) transcript
# Create named list where names are entrez ids and values are the names of the longest transcripts
longest_transcript <- list()
# iterate over the list (this takes a few minutes)
for (i in 1:length(names(transcriptsByGene))) {
# get longest width
largest_width <- max(width(transcriptsByGene[i]))
largest_index <- ( which(width(transcriptsByGene[i]) == largest_width) )
largest_index <- largest_index[[1]][1]
largest_index <- as.numeric(largest_index[[1]])
gene <- names(transcriptsByGene[i])
longest_transcript[[gene]] <- mcols(transcriptsByGene[[i]][largest_index])$tx_name
}
# convert to vector
longest_transcript <- unlist(longest_transcript)
length(longest_transcript)
# Extract transcript sequences
exonsByGene <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = "tx")
tx_ids <- names(exonsByGene)
all_transcript_ids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, keys=tx_ids, columns="TXNAME", keytype="TXID")
select_exon_sets <- subset(exonsByGene, all_transcript_ids$TXNAME %in% longest_transcript)
transcript_sequences <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, transcripts=select_exon_sets)
# change TX ids to TX names
select_transcript_ids <- all_transcript_ids[names(transcript_sequences),]
names(transcript_sequences) <- select_transcript_ids$TXNAME
# Write out longest sequences as fasta, the ids will UCSC transcript ids
writeXStringSet(transcript_sequences, filepath="longest_transcripts.fasta", format='fasta')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment