Created
March 2, 2020 21:46
-
-
Save slavailn/5df79cf89754c0a37b4255a1aa35f5d3 to your computer and use it in GitHub Desktop.
Get longest transcript for a set of select genes and write them out as fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("GenomicFeatures") | |
library("DESeq2") | |
library("TxDb.Hsapiens.UCSC.hg19.knownGene") | |
library("BSgenome.Hsapiens.UCSC.hg19") | |
library("biomaRt") | |
setwd("<working_dir>") | |
# Assuming we are working with DESeq2 projects | |
# get all genes expressed in the project | |
load("DESeq2_object.RData") | |
counts <- counts(dds) | |
dim(counts) # 35087 genes | |
# Get ensembl ids for all of the genes expressed in the project | |
ids <- rownames(counts) | |
TxDb.Hsapiens.UCSC.hg19.knownGene | |
# Get all transcripts per genes using GenomicFeatures | |
transcriptsByGene <- transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, | |
by="gene") | |
# specify mart | |
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org") | |
ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl) | |
# Get entrez ids for ensembl genes expressed in the project | |
attributes <- c("ensembl_gene_id", "entrezgene_id", "hgnc_symbol", "description") | |
annot <- getBM(attributes = attributes, values = ids, mart = ensembl) | |
annot <- annot[!duplicated(annot$ensembl_gene_id),] | |
# Remove records where entrez gene ids are unknown | |
annot <- annot[-which(is.na(annot$entrezgene_id)),] | |
dim(annot) # 22523 genes in total | |
# Extract corresponding genes from transcriptByGene GRanges list | |
transcriptsByGene <- subset(transcriptsByGene, names(transcriptsByGene) %in% as.character(annot$entrezgene_id)) | |
# retained 19040 genes | |
# Keep only the longest (canonical) transcript | |
# Create named list where names are entrez ids and values are the names of the longest transcripts | |
longest_transcript <- list() | |
# iterate over the list (this takes a few minutes) | |
for (i in 1:length(names(transcriptsByGene))) { | |
# get longest width | |
largest_width <- max(width(transcriptsByGene[i])) | |
largest_index <- ( which(width(transcriptsByGene[i]) == largest_width) ) | |
largest_index <- largest_index[[1]][1] | |
largest_index <- as.numeric(largest_index[[1]]) | |
gene <- names(transcriptsByGene[i]) | |
longest_transcript[[gene]] <- mcols(transcriptsByGene[[i]][largest_index])$tx_name | |
} | |
# convert to vector | |
longest_transcript <- unlist(longest_transcript) | |
length(longest_transcript) | |
# Extract transcript sequences | |
exonsByGene <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = "tx") | |
tx_ids <- names(exonsByGene) | |
all_transcript_ids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, keys=tx_ids, columns="TXNAME", keytype="TXID") | |
select_exon_sets <- subset(exonsByGene, all_transcript_ids$TXNAME %in% longest_transcript) | |
transcript_sequences <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, transcripts=select_exon_sets) | |
# change TX ids to TX names | |
select_transcript_ids <- all_transcript_ids[names(transcript_sequences),] | |
names(transcript_sequences) <- select_transcript_ids$TXNAME | |
# Write out longest sequences as fasta, the ids will UCSC transcript ids | |
writeXStringSet(transcript_sequences, filepath="longest_transcripts.fasta", format='fasta') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment