stephenturner/annotation1-symbol-to-genemodel.R

## annotation1-symbol-to-genemodel.R
# Basic work flows: from SYMBOL to gene model and sequence
# Orig from http://master.bioconductor.org/help/course-materials/2014/ISMB2014/brca1.R

library(org.Hs.eg.db)
eid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL")[["ENTREZID"]]

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txid <- select(txdb, eid, "TXNAME", "GENEID")[["TXNAME"]]
cds <- cdsBy(txdb, by="tx", use.names=TRUE)
brca1cds <- cds[names(cds) %in% txid]

library(Gviz)
plotTracks(list(GenomeAxisTrack(), AnnotationTrack(brca1cds))

library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
tx_seq <- extractTranscriptSeqs(genome, brca1cds)
tx_seq

## annotation2-enhancer-snps-near-genes.R
# AnnotationHub work flow: evolutionarily conserved enhancer SNPs near genes on chr17
# Orig: http://master.bioconductor.org/help/course-materials/2014/ISMB2014/enhancer-snps.R

##
## 1. Retrieve enhancers, SNPs from AnnotationHub, gene coordinates
##    from TxDb.*; harmonize genome and chromosome names
##

library(AnnotationHub)
hub <- AnnotationHub()
hg19 <- hub[metadata(hub)$Genome %in% c("hg19", "GRCh37")]

## Enhancers, from ENCODE BroadHmm track
## - details of track
##   http://hgwdev.sdsc.edu/cgi-bin/hgTrackUi?db=hg19&g=wgEncodeBroadHmm
##   http://www.ncbi.nlm.nih.gov/pubmed/21441907
hmm <- hub$goldenpath.hg18.database.wgEncodeBroadHmm_0.0.1.RData
## wrong 'seqlengths' -- 'hg18' in title, but actually liftOver to hg19
library(BSgenome.Hsapiens.UCSC.hg19)
seqlevels(hmm, force=TRUE) <- paste0("chr", 1:22)
seqlengths(hmm) <-
    seqlengths(BSgenome.Hsapiens.UCSC.hg19)[names(seqlengths(hmm))]
genome(hmm) <- "hg19"
enhancers <- hmm[grep("Enhancer", hmm$name)]

## SNPs, e.g., from chr1 of CEU population
snps <- query(hub, "dbSNP.*17.*CEU")[[1]]
## true (length 1) SNPs with single ALT allele
snps <- snps[width(snps) == 1 & elementLengths(alt(snps)) == 1]
seqlevelsStyle(snps) <- "UCSC"
genome(snps) <- "hg19"

## gene coordinates
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
genes <- genes[seqnames(genes) == "chr17"]

##
## 2. Download (large!) conservation track as BED file from UCSC,
##    query for chromosome 1 using rtracklayer
##

library(rtracklayer)
url <- paste("http://hgdownload.cse.ucsc.edu/goldenPath/hg19",
             "phastCons100way/hg19.100way.phastCons.bw", sep="/")
dest <- "~/ISMB-2014/resources/hg19.100way.phastCons.bw"
## download.file(url, dest)
bw <- BigWigFile(dest)
chr17 <- as(seqinfo(BigWigFile(dest)), "GRanges")["chr17"]
score <- import(bw, which=chr17, as="NumericList")[["chr17"]]

##
## 4. subset SNPs by enhancers
##

enhancerSnps <- subsetByOverlaps(snps, enhancers)

##
## 5. Annotate enhancer SNPs with evolutionary conservation score
##

score <- score[start(enhancerSnps)]
## info(enhancerSnps)$score <- score[start(enhancerSnps)]

##
## 6. find nearest and distanceToNearest genes
##

hits <- distanceToNearest(enhancerSnps, genes)
df <- data.frame(
    Near=names(genes)[subjectHits(hits)],
    Distance=mcols(hits)$distance,
    Score=score, stringsAsFactors=FALSE)
	# Basic work flows: from SYMBOL to gene model and sequence
	# Orig from http://master.bioconductor.org/help/course-materials/2014/ISMB2014/brca1.R

	library(org.Hs.eg.db)
	eid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL")[["ENTREZID"]]

	library(TxDb.Hsapiens.UCSC.hg19.knownGene)
	txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
	txid <- select(txdb, eid, "TXNAME", "GENEID")[["TXNAME"]]
	cds <- cdsBy(txdb, by="tx", use.names=TRUE)
	brca1cds <- cds[names(cds) %in% txid]

	library(Gviz)
	plotTracks(list(GenomeAxisTrack(), AnnotationTrack(brca1cds))

	library(BSgenome.Hsapiens.UCSC.hg19)
	genome <- BSgenome.Hsapiens.UCSC.hg19
	tx_seq <- extractTranscriptSeqs(genome, brca1cds)
	tx_seq
	# AnnotationHub work flow: evolutionarily conserved enhancer SNPs near genes on chr17
	# Orig: http://master.bioconductor.org/help/course-materials/2014/ISMB2014/enhancer-snps.R

	##
	## 1. Retrieve enhancers, SNPs from AnnotationHub, gene coordinates
	## from TxDb.*; harmonize genome and chromosome names
	##

	library(AnnotationHub)
	hub <- AnnotationHub()
	hg19 <- hub[metadata(hub)$Genome %in% c("hg19", "GRCh37")]

	## Enhancers, from ENCODE BroadHmm track
	## - details of track
	## http://hgwdev.sdsc.edu/cgi-bin/hgTrackUi?db=hg19&g=wgEncodeBroadHmm
	## http://www.ncbi.nlm.nih.gov/pubmed/21441907
	hmm <- hub$goldenpath.hg18.database.wgEncodeBroadHmm_0.0.1.RData
	## wrong 'seqlengths' -- 'hg18' in title, but actually liftOver to hg19
	library(BSgenome.Hsapiens.UCSC.hg19)
	seqlevels(hmm, force=TRUE) <- paste0("chr", 1:22)
	seqlengths(hmm) <-
	seqlengths(BSgenome.Hsapiens.UCSC.hg19)[names(seqlengths(hmm))]
	genome(hmm) <- "hg19"
	enhancers <- hmm[grep("Enhancer", hmm$name)]

	## SNPs, e.g., from chr1 of CEU population
	snps <- query(hub, "dbSNP.17.CEU")[[1]]
	## true (length 1) SNPs with single ALT allele
	snps <- snps[width(snps) == 1 & elementLengths(alt(snps)) == 1]
	seqlevelsStyle(snps) <- "UCSC"
	genome(snps) <- "hg19"

	## gene coordinates
	library(TxDb.Hsapiens.UCSC.hg19.knownGene)
	genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
	genes <- genes[seqnames(genes) == "chr17"]

	##
	## 2. Download (large!) conservation track as BED file from UCSC,
	## query for chromosome 1 using rtracklayer
	##

	library(rtracklayer)
	url <- paste("http://hgdownload.cse.ucsc.edu/goldenPath/hg19",
	"phastCons100way/hg19.100way.phastCons.bw", sep="/")
	dest <- "~/ISMB-2014/resources/hg19.100way.phastCons.bw"
	## download.file(url, dest)
	bw <- BigWigFile(dest)
	chr17 <- as(seqinfo(BigWigFile(dest)), "GRanges")["chr17"]
	score <- import(bw, which=chr17, as="NumericList")[["chr17"]]

	##
	## 4. subset SNPs by enhancers
	##

	enhancerSnps <- subsetByOverlaps(snps, enhancers)

	##
	## 5. Annotate enhancer SNPs with evolutionary conservation score
	##

	score <- score[start(enhancerSnps)]
	## info(enhancerSnps)$score <- score[start(enhancerSnps)]

	##
	## 6. find nearest and distanceToNearest genes
	##

	hits <- distanceToNearest(enhancerSnps, genes)
	df <- data.frame(
	Near=names(genes)[subjectHits(hits)],
	Distance=mcols(hits)$distance,
	Score=score, stringsAsFactors=FALSE)