TATABOX42/extract_utr.R

## extract_utr.R
# ******************************************************************************
#       FUNCTION: extract.five.utr.sequence | JULY/29/2012 | BENJAMIN TOVAR
# ******************************************************************************

extract.five.utr.sequence <- function(gene.list.refseq,
                                      number.bases.upstream=1000){

    # Author: Benjamin Tovar
    # Date: July 29, 2012
    # Post: http://btovar.com/2015/03/extracting-upstream-regions/

    # *****************************************************************
    # RUNNING NOTES: Please download this packages from Bioconductor
    # http://www.bioconductor.org/packages/release/data/annotation/
    # *****************************************************************
    #   source("http://bioconductor.org/biocLite.R")
    #   biocLite("BSgenome.Hsapiens.UCSC.hg19")
    #   biocLite("Biostrings")
    #   biocLite("org.Hs.eg.db")

    # *****************************************************************
    # RUNNING EXAMPLE
    # *****************************************************************
    #
    ## Extract 250 bases upstream of each gene in gene.list.refseq
    #
    # gene.list.refseq <- c("NM_003588","NM_001145436", "NM_001135188","NM_020760","NM_173362",
    #                    "NM_198393","NM_022736","NM_025074","NM_033449","NM_015726",
    #                    "NM_022110","NM_016478","NM_020634","NM_002291","NM_000418",
    #                    "NM_001862","NM_017752","NM_006591","NM_000124","NM_144610")
    #
    # bases.upstream <- 250
    #
    # result <- extract.five.utr.sequence(gene.list.refseq,bases.upstream)

    # *****************************************************************
    # LOAD THE LIBRARIES
    # *****************************************************************
    cat("Loading libraries",date(),"\n")
    # human genome DNA sequences
        library(BSgenome.Hsapiens.UCSC.hg19)
    # human genome wide annotations
        library(org.Hs.eg.db)
    # load IDs, symbol and gene descriptions
        refseq.id <- toTable(org.Hs.egREFSEQ)
        symbol <- toTable(org.Hs.egSYMBOL)
        gene.description <- toTable(org.Hs.egGENENAME)

    # *****************************************************************
    # LOAD THE UPSTREAM SEQUENCES
    # *****************************************************************

    if(number.bases.upstream <= 1000){
        # load the 1000 bases upstream of all genes in the human genome
        # in the BSgenome.Hsapiens.UCSC.hg19 package
        upstream <- Hsapiens$upstream1000
    }

    if(number.bases.upstream > 1000 && number.bases.upstream <= 2000){
        # load the 1000 bases upstream of all genes in the human genome
        # in the BSgenome.Hsapiens.UCSC.hg19 package
        upstream <- Hsapiens$upstream2000
    }

    if(number.bases.upstream > 2000 && number.bases.upstream <= 5000){
        # load the 1000 bases upstream of all genes in the human genome
        # in the BSgenome.Hsapiens.UCSC.hg19 package
        upstream <- Hsapiens$upstream5000
    }

    if(number.bases.upstream > 5000){
        cat("ERROR: The number of bases upstream of 5'UTR is too large (max value = 5000)\n")
        return(0);
    }

    if(number.bases.upstream < 1){
        cat("ERROR: The number of bases upstream of 5'UTR cannot be less than 1 nucleotide (min value = 1)\n")
        return(0);
    }

    # extract the names of each gene
    names.genes.upstream <- names(upstream)

    # extract the refseq of each gene
    refseq.upstream.id <- vector("character",length(names.genes.upstream))
    for(i in 1:length(names.genes.upstream)){
        refseq.upstream.id[i] <- gsub(", ","_",toString(unlist(strsplit(names.genes.upstream[i],
                                      "\\_"))[c(1,2)],sep=""))
    }
    names(refseq.upstream.id) <- 1:length(refseq.upstream.id)

    # *****************************************************************
    # CREATE OUTPUT OBJECT
    # *****************************************************************
    output.params <- c("entrez.id","symbol","gene.description","five.UTR.sequence")

    output.list <- list()
    for(i in 1:length(gene.list.refseq)){
        output.list[[i]] <- list()
        for(j in 1:length(output.params)){
            output.list[[i]][[j]] <- list()
        }
        names(output.list[[i]]) <- output.params
    }
    names(output.list) <- gene.list.refseq

    # *****************************************************************
    # LET THE COMPUTER DECIDE
    # *****************************************************************
    cat("Extracting data",date(),"\n")

    for(i in 1:length(gene.list.refseq)){
        cat(rep(".",1))

        # extract the entrez.id of each gene in geneList
        output.list[[i]]$entrez.id <- refseq.id[which(refseq.id$accession==gene.list.refseq[i]),]$gene_id

        # extract the symbol of each gene in geneList
        output.list[[i]]$symbol <- symbol[which(symbol==output.list[[i]]$entrez.id),]$symbol

        # extract the gene description of each gene in geneList
        output.list[[i]]$gene.description <- gene.description[which(gene.description==output.list[[i]]$entrez.id),]$gene_name

        # Return the index of the target genes
        index.target.gene <- sequence <- NA;
        index.target.gene <- as.numeric(names(refseq.upstream.id[which(refseq.upstream.id==gene.list.refseq[i])]))

        # NOTE: some genes have repeated RefSeq.id which corresponds to the same gene in other Chromosomes.
        # by using index.target.gene[1] we are selecting only the first RefSeq entry.
        sequence <- upstream[index.target.gene[1]]
        sequence.length <- nchar(sequence)
        start.position <- ((sequence.length-number.bases.upstream)+1)
        sequence <- substring(toString(sequence),1:sequence.length,1:sequence.length)
        sequence <- sequence[start.position:sequence.length]
        output.list[[i]]$five.UTR.sequence <- sequence
    }
    cat("\n")
    cat("Computations DONE",date(),"\n")
    return(output.list)
}
	# ******************************************************************************
	# FUNCTION: extract.five.utr.sequence \| JULY/29/2012 \| BENJAMIN TOVAR
	# ******************************************************************************

	extract.five.utr.sequence <- function(gene.list.refseq,
	number.bases.upstream=1000){

	# Author: Benjamin Tovar
	# Date: July 29, 2012
	# Post: http://btovar.com/2015/03/extracting-upstream-regions/

	# *****************************************************************
	# RUNNING NOTES: Please download this packages from Bioconductor
	# http://www.bioconductor.org/packages/release/data/annotation/
	# *****************************************************************
	# source("http://bioconductor.org/biocLite.R")
	# biocLite("BSgenome.Hsapiens.UCSC.hg19")
	# biocLite("Biostrings")
	# biocLite("org.Hs.eg.db")

	# *****************************************************************
	# RUNNING EXAMPLE
	# *****************************************************************
	#
	## Extract 250 bases upstream of each gene in gene.list.refseq
	#
	# gene.list.refseq <- c("NM_003588","NM_001145436", "NM_001135188","NM_020760","NM_173362",
	# "NM_198393","NM_022736","NM_025074","NM_033449","NM_015726",
	# "NM_022110","NM_016478","NM_020634","NM_002291","NM_000418",
	# "NM_001862","NM_017752","NM_006591","NM_000124","NM_144610")
	#
	# bases.upstream <- 250
	#
	# result <- extract.five.utr.sequence(gene.list.refseq,bases.upstream)

	# *****************************************************************
	# LOAD THE LIBRARIES
	# *****************************************************************
	cat("Loading libraries",date(),"\n")
	# human genome DNA sequences
	library(BSgenome.Hsapiens.UCSC.hg19)
	# human genome wide annotations
	library(org.Hs.eg.db)
	# load IDs, symbol and gene descriptions
	refseq.id <- toTable(org.Hs.egREFSEQ)
	symbol <- toTable(org.Hs.egSYMBOL)
	gene.description <- toTable(org.Hs.egGENENAME)

	# *****************************************************************
	# LOAD THE UPSTREAM SEQUENCES
	# *****************************************************************

	if(number.bases.upstream <= 1000){
	# load the 1000 bases upstream of all genes in the human genome
	# in the BSgenome.Hsapiens.UCSC.hg19 package
	upstream <- Hsapiens$upstream1000
	}

	if(number.bases.upstream > 1000 && number.bases.upstream <= 2000){
	# load the 1000 bases upstream of all genes in the human genome
	# in the BSgenome.Hsapiens.UCSC.hg19 package
	upstream <- Hsapiens$upstream2000
	}

	if(number.bases.upstream > 2000 && number.bases.upstream <= 5000){
	# load the 1000 bases upstream of all genes in the human genome
	# in the BSgenome.Hsapiens.UCSC.hg19 package
	upstream <- Hsapiens$upstream5000
	}

	if(number.bases.upstream > 5000){
	cat("ERROR: The number of bases upstream of 5'UTR is too large (max value = 5000)\n")
	return(0);
	}

	if(number.bases.upstream < 1){
	cat("ERROR: The number of bases upstream of 5'UTR cannot be less than 1 nucleotide (min value = 1)\n")
	return(0);
	}

	# extract the names of each gene
	names.genes.upstream <- names(upstream)

	# extract the refseq of each gene
	refseq.upstream.id <- vector("character",length(names.genes.upstream))
	for(i in 1:length(names.genes.upstream)){
	refseq.upstream.id[i] <- gsub(", ","_",toString(unlist(strsplit(names.genes.upstream[i],
	"\\_"))[c(1,2)],sep=""))
	}
	names(refseq.upstream.id) <- 1:length(refseq.upstream.id)

	# *****************************************************************
	# CREATE OUTPUT OBJECT
	# *****************************************************************
	output.params <- c("entrez.id","symbol","gene.description","five.UTR.sequence")

	output.list <- list()
	for(i in 1:length(gene.list.refseq)){
	output.list[[i]] <- list()
	for(j in 1:length(output.params)){
	output.list[[i]][[j]] <- list()
	}
	names(output.list[[i]]) <- output.params
	}
	names(output.list) <- gene.list.refseq

	# *****************************************************************
	# LET THE COMPUTER DECIDE
	# *****************************************************************
	cat("Extracting data",date(),"\n")

	for(i in 1:length(gene.list.refseq)){
	cat(rep(".",1))

	# extract the entrez.id of each gene in geneList
	output.list[[i]]$entrez.id <- refseq.id[which(refseq.id$accession==gene.list.refseq[i]),]$gene_id

	# extract the symbol of each gene in geneList
	output.list[[i]]$symbol <- symbol[which(symbol==output.list[[i]]$entrez.id),]$symbol

	# extract the gene description of each gene in geneList
	output.list[[i]]$gene.description <- gene.description[which(gene.description==output.list[[i]]$entrez.id),]$gene_name

	# Return the index of the target genes
	index.target.gene <- sequence <- NA;
	index.target.gene <- as.numeric(names(refseq.upstream.id[which(refseq.upstream.id==gene.list.refseq[i])]))

	# NOTE: some genes have repeated RefSeq.id which corresponds to the same gene in other Chromosomes.
	# by using index.target.gene[1] we are selecting only the first RefSeq entry.
	sequence <- upstream[index.target.gene[1]]
	sequence.length <- nchar(sequence)
	start.position <- ((sequence.length-number.bases.upstream)+1)
	sequence <- substring(toString(sequence),1:sequence.length,1:sequence.length)
	sequence <- sequence[start.position:sequence.length]
	output.list[[i]]$five.UTR.sequence <- sequence
	}
	cat("\n")
	cat("Computations DONE",date(),"\n")
	return(output.list)
	}