ccwang002/get_human_3UTR.r

## get_human_3UTR.r
# Code here is adapted from
# http://gettinggeneticsdone.blogspot.tw/2011/04/using-rstats-bioconductor-to-get.html
# If Bioconductor or the required packages (e.g., BSgenome) are not installed,
# One should first run the following code section "Installation"
source("http://www.bioconductor.org/biocLite.R")

# === Installation ===
# --- known gene ---
# biocLite('TxDb.Hsapiens.UCSC.hg19.knownGene')

# --- gene id conversion ---
# biocLite("org.Hs.eg.db")
# biocLite("annotate")

# --- reference sequence ---
# Load the BSgenome package and download the hg19 reference sequence
# For documentation see http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
# biocLite("BSgenome")
# biocLite("BSgenome.Hsapiens.UCSC.hg19")  #installs the human genome (~850 MB download).

# === Gene Annotation ===
# get all known genes
library(TxDb.Hsapiens.UCSC.hg19.knownGene, quietly = TRUE)
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene

# get 3' UTR info of all known genes
hg19_3UTR <- as.data.frame(threeUTRsByTranscript(txdb_hg19, use.names=TRUE))

# convert gene id into gene symbol name
library(org.Hs.eg.db)
library(annotate)
library(plyr)
names(hg19_3UTR)[[1]] <- 'UCSCKG_ID'  # change 'element' into more meaningful name
# select(...) is a DB access function.
# It selects gene symbol of Human DB based on colomn 'UCSCKG' with keys in 3' UTR gene ID (UCSCKG_ID),
# so it returns a table with three colomns: UCSCKG, SYMBOL, and ENTREZID
gene_id_table <-select(org.Hs.eg.db, hg19_3UTR$UCSCKG_ID,
                       cols=c("SYMBOL", "ENTREZID"),
                       keytype="UCSCKG")
# copy the query table to main 3' UTR table per column
hg19_3UTR$GENE_SYMBOL <- gene_id_table$SYMBOL
hg19_3UTR$ENTREZ_ID <- gene_id_table$ENTREZID
# reorder data frame
hg19_3UTR <- hg19_3UTR[, c(10, 11, 1:9)]

# === Reference Sequence ===
library(BSgenome.Hsapiens.UCSC.hg19)
Hsapiens  # now the genome reference has been loaded
library(plyr)
# Test the function by use part of the 3' UTR data
# partial <- hg19_3UTR[1:10,]
# partial$seq <- getSeq(Hsapiens, partial$seqnames,
#                       start=partial$start, end=partial$end, strand=partial$strand,
#                       as.character=TRUE)
# retrieve all sequence
hg19_3UTR$sequence <- getSeq(Hsapiens, hg19_3UTR$seqnames,
                             start=hg19_3UTR$start, end=hg19_3UTR$end, strand=hg19_3UTR$strand,
                             as.character=TRUE)
# save to csv
write.csv(hg19_3UTR, file='hg19_3UTR_raw.csv', row.names=FALSE)  # file size ~ 80MB
	# Code here is adapted from
	# http://gettinggeneticsdone.blogspot.tw/2011/04/using-rstats-bioconductor-to-get.html
	# If Bioconductor or the required packages (e.g., BSgenome) are not installed,
	# One should first run the following code section "Installation"
	source("http://www.bioconductor.org/biocLite.R")

	# === Installation ===
	# --- known gene ---
	# biocLite('TxDb.Hsapiens.UCSC.hg19.knownGene')

	# --- gene id conversion ---
	# biocLite("org.Hs.eg.db")
	# biocLite("annotate")

	# --- reference sequence ---
	# Load the BSgenome package and download the hg19 reference sequence
	# For documentation see http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
	# biocLite("BSgenome")
	# biocLite("BSgenome.Hsapiens.UCSC.hg19") #installs the human genome (~850 MB download).

	# === Gene Annotation ===
	# get all known genes
	library(TxDb.Hsapiens.UCSC.hg19.knownGene, quietly = TRUE)
	txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene

	# get 3' UTR info of all known genes
	hg19_3UTR <- as.data.frame(threeUTRsByTranscript(txdb_hg19, use.names=TRUE))

	# convert gene id into gene symbol name
	library(org.Hs.eg.db)
	library(annotate)
	library(plyr)
	names(hg19_3UTR)[[1]] <- 'UCSCKG_ID' # change 'element' into more meaningful name
	# select(...) is a DB access function.
	# It selects gene symbol of Human DB based on colomn 'UCSCKG' with keys in 3' UTR gene ID (UCSCKG_ID),
	# so it returns a table with three colomns: UCSCKG, SYMBOL, and ENTREZID
	gene_id_table <-select(org.Hs.eg.db, hg19_3UTR$UCSCKG_ID,
	cols=c("SYMBOL", "ENTREZID"),
	keytype="UCSCKG")
	# copy the query table to main 3' UTR table per column
	hg19_3UTR$GENE_SYMBOL <- gene_id_table$SYMBOL
	hg19_3UTR$ENTREZ_ID <- gene_id_table$ENTREZID
	# reorder data frame
	hg19_3UTR <- hg19_3UTR[, c(10, 11, 1:9)]

	# === Reference Sequence ===
	library(BSgenome.Hsapiens.UCSC.hg19)
	Hsapiens # now the genome reference has been loaded
	library(plyr)
	# Test the function by use part of the 3' UTR data
	# partial <- hg19_3UTR[1:10,]
	# partial$seq <- getSeq(Hsapiens, partial$seqnames,
	# start=partial$start, end=partial$end, strand=partial$strand,
	# as.character=TRUE)
	# retrieve all sequence
	hg19_3UTR$sequence <- getSeq(Hsapiens, hg19_3UTR$seqnames,
	start=hg19_3UTR$start, end=hg19_3UTR$end, strand=hg19_3UTR$strand,
	as.character=TRUE)
	# save to csv
	write.csv(hg19_3UTR, file='hg19_3UTR_raw.csv', row.names=FALSE) # file size ~ 80MB