Created
March 15, 2015 02:41
-
-
Save TATABOX42/6414de529ac90f32be41 to your computer and use it in GitHub Desktop.
Code used for post: http://btovar.com/2015/03/extracting-upstream-regions/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ****************************************************************************** | |
# FUNCTION: extract.five.utr.sequence | JULY/29/2012 | BENJAMIN TOVAR | |
# ****************************************************************************** | |
extract.five.utr.sequence <- function(gene.list.refseq, | |
number.bases.upstream=1000){ | |
# Author: Benjamin Tovar | |
# Date: July 29, 2012 | |
# Post: http://btovar.com/2015/03/extracting-upstream-regions/ | |
# ***************************************************************** | |
# RUNNING NOTES: Please download this packages from Bioconductor | |
# http://www.bioconductor.org/packages/release/data/annotation/ | |
# ***************************************************************** | |
# source("http://bioconductor.org/biocLite.R") | |
# biocLite("BSgenome.Hsapiens.UCSC.hg19") | |
# biocLite("Biostrings") | |
# biocLite("org.Hs.eg.db") | |
# ***************************************************************** | |
# RUNNING EXAMPLE | |
# ***************************************************************** | |
# | |
## Extract 250 bases upstream of each gene in gene.list.refseq | |
# | |
# gene.list.refseq <- c("NM_003588","NM_001145436", "NM_001135188","NM_020760","NM_173362", | |
# "NM_198393","NM_022736","NM_025074","NM_033449","NM_015726", | |
# "NM_022110","NM_016478","NM_020634","NM_002291","NM_000418", | |
# "NM_001862","NM_017752","NM_006591","NM_000124","NM_144610") | |
# | |
# bases.upstream <- 250 | |
# | |
# result <- extract.five.utr.sequence(gene.list.refseq,bases.upstream) | |
# ***************************************************************** | |
# LOAD THE LIBRARIES | |
# ***************************************************************** | |
cat("Loading libraries",date(),"\n") | |
# human genome DNA sequences | |
library(BSgenome.Hsapiens.UCSC.hg19) | |
# human genome wide annotations | |
library(org.Hs.eg.db) | |
# load IDs, symbol and gene descriptions | |
refseq.id <- toTable(org.Hs.egREFSEQ) | |
symbol <- toTable(org.Hs.egSYMBOL) | |
gene.description <- toTable(org.Hs.egGENENAME) | |
# ***************************************************************** | |
# LOAD THE UPSTREAM SEQUENCES | |
# ***************************************************************** | |
if(number.bases.upstream <= 1000){ | |
# load the 1000 bases upstream of all genes in the human genome | |
# in the BSgenome.Hsapiens.UCSC.hg19 package | |
upstream <- Hsapiens$upstream1000 | |
} | |
if(number.bases.upstream > 1000 && number.bases.upstream <= 2000){ | |
# load the 1000 bases upstream of all genes in the human genome | |
# in the BSgenome.Hsapiens.UCSC.hg19 package | |
upstream <- Hsapiens$upstream2000 | |
} | |
if(number.bases.upstream > 2000 && number.bases.upstream <= 5000){ | |
# load the 1000 bases upstream of all genes in the human genome | |
# in the BSgenome.Hsapiens.UCSC.hg19 package | |
upstream <- Hsapiens$upstream5000 | |
} | |
if(number.bases.upstream > 5000){ | |
cat("ERROR: The number of bases upstream of 5'UTR is too large (max value = 5000)\n") | |
return(0); | |
} | |
if(number.bases.upstream < 1){ | |
cat("ERROR: The number of bases upstream of 5'UTR cannot be less than 1 nucleotide (min value = 1)\n") | |
return(0); | |
} | |
# extract the names of each gene | |
names.genes.upstream <- names(upstream) | |
# extract the refseq of each gene | |
refseq.upstream.id <- vector("character",length(names.genes.upstream)) | |
for(i in 1:length(names.genes.upstream)){ | |
refseq.upstream.id[i] <- gsub(", ","_",toString(unlist(strsplit(names.genes.upstream[i], | |
"\\_"))[c(1,2)],sep="")) | |
} | |
names(refseq.upstream.id) <- 1:length(refseq.upstream.id) | |
# ***************************************************************** | |
# CREATE OUTPUT OBJECT | |
# ***************************************************************** | |
output.params <- c("entrez.id","symbol","gene.description","five.UTR.sequence") | |
output.list <- list() | |
for(i in 1:length(gene.list.refseq)){ | |
output.list[[i]] <- list() | |
for(j in 1:length(output.params)){ | |
output.list[[i]][[j]] <- list() | |
} | |
names(output.list[[i]]) <- output.params | |
} | |
names(output.list) <- gene.list.refseq | |
# ***************************************************************** | |
# LET THE COMPUTER DECIDE | |
# ***************************************************************** | |
cat("Extracting data",date(),"\n") | |
for(i in 1:length(gene.list.refseq)){ | |
cat(rep(".",1)) | |
# extract the entrez.id of each gene in geneList | |
output.list[[i]]$entrez.id <- refseq.id[which(refseq.id$accession==gene.list.refseq[i]),]$gene_id | |
# extract the symbol of each gene in geneList | |
output.list[[i]]$symbol <- symbol[which(symbol==output.list[[i]]$entrez.id),]$symbol | |
# extract the gene description of each gene in geneList | |
output.list[[i]]$gene.description <- gene.description[which(gene.description==output.list[[i]]$entrez.id),]$gene_name | |
# Return the index of the target genes | |
index.target.gene <- sequence <- NA; | |
index.target.gene <- as.numeric(names(refseq.upstream.id[which(refseq.upstream.id==gene.list.refseq[i])])) | |
# NOTE: some genes have repeated RefSeq.id which corresponds to the same gene in other Chromosomes. | |
# by using index.target.gene[1] we are selecting only the first RefSeq entry. | |
sequence <- upstream[index.target.gene[1]] | |
sequence.length <- nchar(sequence) | |
start.position <- ((sequence.length-number.bases.upstream)+1) | |
sequence <- substring(toString(sequence),1:sequence.length,1:sequence.length) | |
sequence <- sequence[start.position:sequence.length] | |
output.list[[i]]$five.UTR.sequence <- sequence | |
} | |
cat("\n") | |
cat("Computations DONE",date(),"\n") | |
return(output.list) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment