Last active
December 19, 2015 15:48
-
-
Save ccwang002/5978498 to your computer and use it in GitHub Desktop.
Retreive all human(hg19) 3' UTR sequences and their correspondent gene names thru R/Bioconductor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code here is adapted from | |
# http://gettinggeneticsdone.blogspot.tw/2011/04/using-rstats-bioconductor-to-get.html | |
# If Bioconductor or the required packages (e.g., BSgenome) are not installed, | |
# One should first run the following code section "Installation" | |
source("http://www.bioconductor.org/biocLite.R") | |
# === Installation === | |
# --- known gene --- | |
# biocLite('TxDb.Hsapiens.UCSC.hg19.knownGene') | |
# --- gene id conversion --- | |
# biocLite("org.Hs.eg.db") | |
# biocLite("annotate") | |
# --- reference sequence --- | |
# Load the BSgenome package and download the hg19 reference sequence | |
# For documentation see http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html | |
# biocLite("BSgenome") | |
# biocLite("BSgenome.Hsapiens.UCSC.hg19") #installs the human genome (~850 MB download). | |
# === Gene Annotation === | |
# get all known genes | |
library(TxDb.Hsapiens.UCSC.hg19.knownGene, quietly = TRUE) | |
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene | |
# get 3' UTR info of all known genes | |
hg19_3UTR <- as.data.frame(threeUTRsByTranscript(txdb_hg19, use.names=TRUE)) | |
# convert gene id into gene symbol name | |
library(org.Hs.eg.db) | |
library(annotate) | |
library(plyr) | |
names(hg19_3UTR)[[1]] <- 'UCSCKG_ID' # change 'element' into more meaningful name | |
# select(...) is a DB access function. | |
# It selects gene symbol of Human DB based on colomn 'UCSCKG' with keys in 3' UTR gene ID (UCSCKG_ID), | |
# so it returns a table with three colomns: UCSCKG, SYMBOL, and ENTREZID | |
gene_id_table <-select(org.Hs.eg.db, hg19_3UTR$UCSCKG_ID, | |
cols=c("SYMBOL", "ENTREZID"), | |
keytype="UCSCKG") | |
# copy the query table to main 3' UTR table per column | |
hg19_3UTR$GENE_SYMBOL <- gene_id_table$SYMBOL | |
hg19_3UTR$ENTREZ_ID <- gene_id_table$ENTREZID | |
# reorder data frame | |
hg19_3UTR <- hg19_3UTR[, c(10, 11, 1:9)] | |
# === Reference Sequence === | |
library(BSgenome.Hsapiens.UCSC.hg19) | |
Hsapiens # now the genome reference has been loaded | |
library(plyr) | |
# Test the function by use part of the 3' UTR data | |
# partial <- hg19_3UTR[1:10,] | |
# partial$seq <- getSeq(Hsapiens, partial$seqnames, | |
# start=partial$start, end=partial$end, strand=partial$strand, | |
# as.character=TRUE) | |
# retrieve all sequence | |
hg19_3UTR$sequence <- getSeq(Hsapiens, hg19_3UTR$seqnames, | |
start=hg19_3UTR$start, end=hg19_3UTR$end, strand=hg19_3UTR$strand, | |
as.character=TRUE) | |
# save to csv | |
write.csv(hg19_3UTR, file='hg19_3UTR_raw.csv', row.names=FALSE) # file size ~ 80MB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment