Skip to content

Instantly share code, notes, and snippets.

@davetang
Last active May 12, 2022 06:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save davetang/6562391 to your computer and use it in GitHub Desktop.
Save davetang/6562391 to your computer and use it in GitHub Desktop.
Using biomaRt to fetch all human mRNA refSeqs and their corresponding chromosome coordinates
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
#load library
library("biomaRt")
#use ensembl mart
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
#store the filters
filters <- listFilters(ensembl)
#look for filters with the string refseq
grep('refseq', filters$name, ignore.case=T, value=T)
#[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna"
#[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted"
#[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna"
#[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted"
#store the attributes we can fetch
attributes <- listAttributes(ensembl)
#check out the first 10 attributes
head(attributes,10)
# name description
#1 ensembl_gene_id Ensembl Gene ID
#2 ensembl_transcript_id Ensembl Transcript ID
#3 ensembl_peptide_id Ensembl Protein ID
#4 ensembl_exon_id Ensembl Exon ID
#5 description Description
#6 chromosome_name Chromosome Name
#7 start_position Gene Start (bp)
#8 end_position Gene End (bp)
#9 strand Strand
#10 band Band
#create vector of chromosomes
my_chr <- c(1:22,'X','Y')
#fetch refseqs
my_refseq <- getBM(attributes='refseq_mrna',
filters = 'chromosome_name',
values = my_chr,
mart = ensembl)
#how many entries 2013 November 28th
length(my_refseq)
[1] 35423
#check out the first few
head(my_refseq)
[1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840"
[6] "NM_173505"
#I only want entries starting with a NM (curated mRNA)
my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
length(my_refseq)
[1] 33584
#check to see if only NM
head(my_refseq)
[1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505"
[6] "NM_033453"
#build attribute vector
my_attribute <- c('refseq_mrna',
'chromosome_name',
'transcript_start',
'transcript_end',
'strand')
#fetch refseqs and their chromosomal locations
my_refseq_loci <- getBM(attributes=my_attribute,
filters = c('refseq_mrna', 'chromosome_name'),
values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
mart = ensembl)
dim(my_refseq_loci)
#[1] 33657 5
#how many refseq ids are listed in multiple places?
table(duplicated(my_refseq_loci$refseq_mrna))
#FALSE TRUE
#33584 73
#get rid of the duplicated entry
my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
dim(my_refseq_loci)
#[1] 33584 5
#convert the strand into '-' and '+'
my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)
#add a 'chr' into the chromosome_name
my_refseq_loci$chromosome_name <- gsub(pattern="^",
replacement='chr',
my_refseq_loci$chromosome_name)
#we now have a data frame of all human mRNA refseqs and their chromosomal locations
head(my_refseq_loci)
# refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392 chr22 24313554 24316773 -
#2 NM_001355 chr22 24313554 24322019 -
#3 NM_138450 chr13 50202435 50208008 +
#4 NM_022840 chr18 2538452 2571485 -
#5 NM_033453 chr20 3190006 3204516 +
#6 NM_001267623 chr20 3190171 3204516 +
#I want locations of the region spanning the start of a refSeq
span <- 2
#store as another object
my_refseq_tss <- my_refseq_loci
#positive strand
#adjust the end position first, because we need the start position in our calculations
my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span
#negative strand
my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span
head(my_refseq_tss)
# refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392 chr22 24316771 24316775 -
#2 NM_001355 chr22 24322017 24322021 -
#3 NM_138450 chr13 50202433 50202437 +
#4 NM_022840 chr18 2571483 2571487 -
#5 NM_033453 chr20 3190004 3190008 +
#6 NM_001267623 chr20 3190169 3190173 +
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment