davetang/biomart_refseq.R

## biomart_refseq.R
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")

#load library
library("biomaRt")

#use ensembl mart
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")

#store the filters
filters <- listFilters(ensembl)

#look for filters with the string refseq
grep('refseq', filters$name, ignore.case=T, value=T)
#[1] "with_refseq_peptide"            "with_refseq_peptide_predicted"  "with_ox_refseq_mrna"
#[4] "with_ox_refseq_mrna_predicted"  "with_ox_refseq_ncrna"           "with_ox_refseq_ncrna_predicted"
#[7] "refseq_mrna"                    "refseq_mrna_predicted"          "refseq_ncrna"
#[10] "refseq_ncrna_predicted"         "refseq_peptide"                 "refseq_peptide_predicted"

#store the attributes we can fetch
attributes <- listAttributes(ensembl)

#check out the first 10 attributes
head(attributes,10)
#                    name           description
#1        ensembl_gene_id       Ensembl Gene ID
#2  ensembl_transcript_id Ensembl Transcript ID
#3     ensembl_peptide_id    Ensembl Protein ID
#4        ensembl_exon_id       Ensembl Exon ID
#5            description           Description
#6        chromosome_name       Chromosome Name
#7         start_position       Gene Start (bp)
#8           end_position         Gene End (bp)
#9                 strand                Strand
#10                  band                  Band

#create vector of chromosomes
my_chr <- c(1:22,'X','Y')

#fetch refseqs
my_refseq <- getBM(attributes='refseq_mrna',
                   filters = 'chromosome_name',
                   values = my_chr,
                   mart = ensembl)

#how many entries 2013 November 28th
length(my_refseq)
[1] 35423

#check out the first few
head(my_refseq)
[1] "NM_001084392" "NM_001355"    "NR_036221"    "NM_138450"    "NM_022840"
[6] "NM_173505"

#I only want entries starting with a NM (curated mRNA)
my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
length(my_refseq)
[1] 33584

#check to see if only NM
head(my_refseq)
[1] "NM_001084392" "NM_001355"    "NM_138450"    "NM_022840"    "NM_173505"
[6] "NM_033453"

#build attribute vector
my_attribute <- c('refseq_mrna',
                  'chromosome_name',
                  'transcript_start',
                  'transcript_end',
                  'strand')

#fetch refseqs and their chromosomal locations
my_refseq_loci <- getBM(attributes=my_attribute,
                        filters = c('refseq_mrna', 'chromosome_name'),
                        values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
                        mart = ensembl)

dim(my_refseq_loci)
#[1] 33657     5

#how many refseq ids are listed in multiple places?
table(duplicated(my_refseq_loci$refseq_mrna))

#FALSE  TRUE
#33584    73

#get rid of the duplicated entry
my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
dim(my_refseq_loci)
#[1] 33584     5

#convert the strand into '-' and '+'
my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)

#add a 'chr' into the chromosome_name
my_refseq_loci$chromosome_name <- gsub(pattern="^",
                                       replacement='chr',
                                       my_refseq_loci$chromosome_name)

#we now have a data frame of all human mRNA refseqs and their chromosomal locations
head(my_refseq_loci)
#   refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392           chr22         24313554       24316773      -
#2    NM_001355           chr22         24313554       24322019      -
#3    NM_138450           chr13         50202435       50208008      +
#4    NM_022840           chr18          2538452        2571485      -
#5    NM_033453           chr20          3190006        3204516      +
#6 NM_001267623           chr20          3190171        3204516      +

#I want locations of the region spanning the start of a refSeq
span <- 2

#store as another object
my_refseq_tss <- my_refseq_loci

#positive strand
#adjust the end position first, because we need the start position in our calculations
my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span

#negative strand
my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span

head(my_refseq_tss)
#   refseq_mrna chromosome_name transcript_start transcript_end strand
#1 NM_001084392           chr22         24316771       24316775      -
#2    NM_001355           chr22         24322017       24322021      -
#3    NM_138450           chr13         50202433       50202437      +
#4    NM_022840           chr18          2571483        2571487      -
#5    NM_033453           chr20          3190004        3190008      +
#6 NM_001267623           chr20          3190169        3190173      +
	#install if necessary
	source("http://bioconductor.org/biocLite.R")
	biocLite("biomaRt")

	#load library
	library("biomaRt")

	#use ensembl mart
	ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")

	#store the filters
	filters <- listFilters(ensembl)

	#look for filters with the string refseq
	grep('refseq', filters$name, ignore.case=T, value=T)
	#[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna"
	#[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted"
	#[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna"
	#[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted"

	#store the attributes we can fetch
	attributes <- listAttributes(ensembl)

	#check out the first 10 attributes
	head(attributes,10)
	# name description
	#1 ensembl_gene_id Ensembl Gene ID
	#2 ensembl_transcript_id Ensembl Transcript ID
	#3 ensembl_peptide_id Ensembl Protein ID
	#4 ensembl_exon_id Ensembl Exon ID
	#5 description Description
	#6 chromosome_name Chromosome Name
	#7 start_position Gene Start (bp)
	#8 end_position Gene End (bp)
	#9 strand Strand
	#10 band Band

	#create vector of chromosomes
	my_chr <- c(1:22,'X','Y')

	#fetch refseqs
	my_refseq <- getBM(attributes='refseq_mrna',
	filters = 'chromosome_name',
	values = my_chr,
	mart = ensembl)

	#how many entries 2013 November 28th
	length(my_refseq)
	[1] 35423

	#check out the first few
	head(my_refseq)
	[1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840"
	[6] "NM_173505"

	#I only want entries starting with a NM (curated mRNA)
	my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
	length(my_refseq)
	[1] 33584

	#check to see if only NM
	head(my_refseq)
	[1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505"
	[6] "NM_033453"

	#build attribute vector
	my_attribute <- c('refseq_mrna',
	'chromosome_name',
	'transcript_start',
	'transcript_end',
	'strand')

	#fetch refseqs and their chromosomal locations
	my_refseq_loci <- getBM(attributes=my_attribute,
	filters = c('refseq_mrna', 'chromosome_name'),
	values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
	mart = ensembl)

	dim(my_refseq_loci)
	#[1] 33657 5

	#how many refseq ids are listed in multiple places?
	table(duplicated(my_refseq_loci$refseq_mrna))

	#FALSE TRUE
	#33584 73

	#get rid of the duplicated entry
	my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
	dim(my_refseq_loci)
	#[1] 33584 5

	#convert the strand into '-' and '+'
	my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
	my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)

	#add a 'chr' into the chromosome_name
	my_refseq_loci$chromosome_name <- gsub(pattern="^",
	replacement='chr',
	my_refseq_loci$chromosome_name)

	#we now have a data frame of all human mRNA refseqs and their chromosomal locations
	head(my_refseq_loci)
	# refseq_mrna chromosome_name transcript_start transcript_end strand
	#1 NM_001084392 chr22 24313554 24316773 -
	#2 NM_001355 chr22 24313554 24322019 -
	#3 NM_138450 chr13 50202435 50208008 +
	#4 NM_022840 chr18 2538452 2571485 -
	#5 NM_033453 chr20 3190006 3204516 +
	#6 NM_001267623 chr20 3190171 3204516 +

	#I want locations of the region spanning the start of a refSeq
	span <- 2

	#store as another object
	my_refseq_tss <- my_refseq_loci

	#positive strand
	#adjust the end position first, because we need the start position in our calculations
	my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
	my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span

	#negative strand
	my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
	my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span

	head(my_refseq_tss)
	# refseq_mrna chromosome_name transcript_start transcript_end strand
	#1 NM_001084392 chr22 24316771 24316775 -
	#2 NM_001355 chr22 24322017 24322021 -
	#3 NM_138450 chr13 50202433 50202437 +
	#4 NM_022840 chr18 2571483 2571487 -
	#5 NM_033453 chr20 3190004 3190008 +
	#6 NM_001267623 chr20 3190169 3190173 +