biocyberman/annotateBed.R

## annotateBed.R
#!/usr/bin/env Rscript
## Date: Tue Aug 7 10:43:27 2018
## Last modified Time-stamp: <2018-08-10 12:54:38 CEST (vql)>
## Author: Vang Quy Le
## Organization: Aalborg University Hospital

## Copyright 2018 Vang Quy Le
##
##  This file is free software: you may copy, redistribute and/or modify it
##  under the terms of the GNU General Public License as published by the
##  Free Software Foundation, either version 2 of the License, or (at your
##  option) any later version.
##
##  This file is distributed in the hope that it will be useful, but
##  WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
##  General Public License for more details.
##
##  You should have received a copy of the GNU General Public License
##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
## INTRO
## Clean up "Name" column of a BED file (CREv2.bed) by annotating the intervals
## with gene names. The script also tries to add strand information whenever
## possible.
################################################################################

################################################################################
## LIBRARIES
################################################################################

library(Homo.sapiens)
library(GenomicRanges)
library(GenomicFeatures)

################################################################################
## FUNCTIONS
################################################################################

geneRanges2 <- function(){
  ## Discussed here: https://support.bioconductor.org/p/111849/#111853
  uh <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene, single.strand.genes.only = FALSE)
  uh <- unlist(uh)
  mcols(uh) <- mapIds(org.Hs.eg.db, names(uh), "SYMBOL", "ENTREZID")
  names(mcols(uh)) <- "SYMBOL"
  uh
}

geneRanges3 <- function(){
  library(rtracklayer)
  gtffile <- "Homo_sapiens.GRCh37.87.gff3"
  gtf <- rtracklayer::import(gtffile, format = "gff3",
                            feature.type = "gene", version = "3")
  hg19txdbensGene <- gtf
  ## hg19txdbensGene <- makeTxDbFromEnsembl(organism="Homo sapiens",
  ##                     release=93,
  ##                     circ_seqs=DEFAULT_CIRC_SEQS,
  ##                     server="ensembldb.ensembl.org")
  seqlevelsStyle(hg19txdbensGene) <- "UCSC"
  hg19txdbensGene <- keepStandardChromosomes(hg19txdbensGene,
                                             species = "Homo sapiens",
                                             pruning.mode = "tidy")
  names(mcols(hg19txdbensGene))[6] <- "SYMBOL"
  hg19txdbensGene
}

## This is discussed here: https://support.bioconductor.org/p/67118/
geneRanges <- function(db, column="ENTREZID")
{
  g <- genes(db, columns=column)
  col <- mcols(g)[[column]]
  genes <- granges(g)[rep(seq_along(g), elementNROWS(col))]
  mcols(genes)[[column]] <- as.character(unlist(col))
  genes
}


## query: the gene range
## subject: the bed file
splitColumnByOverlap <-function(query, subject, column="ENTREZID", ...)
{
  olaps <- findOverlaps(query, subject, ...)
  f1 <- factor(subjectHits(olaps),
               levels=seq_len(subjectLength(olaps)))
  splitAsList(mcols(query)[[column]][queryHits(olaps)], f1)
}

mapBedtoGene <-function(genes, bed, column="ENTREZID", ...)
{
    olaps <- findOverlaps(genes, bed, ...)
    f1 <- factor(subjectHits(olaps),
                 levels=seq_len(subjectLength(olaps)))
    out <- bed[subjectHits(olaps)]
    mcols(out)[["oldname"]] <- mcols(out)[["name"]]
    mcols(out)[["name"]] <-  mcols(genes)[[column]][queryHits(olaps)]
    strand(out) <- strand(genes)[queryHits(olaps)]
    nohits <- bed[-subjectHits(olaps)]

    ## Fix names
    oldnames <- mcols(nohits)[["name"]]
    mcols(nohits)[["oldname"]] <- oldnames
     oldnames <- gsub("(ref|ens)\\|([A-Za-z0-9-]+),.*", "\\2", oldnames, perl = T)
    ## oldnames <- gsub("^(.*)$", "\\1-NoHit", oldnames, perl = T)
    mcols(nohits)[["name"]] <- gsub("(ref|ens)\\|([A-Za-z0-9-]+)$", "\\2",
                                    oldnames, perl = T)
    rtracklayer::export(nohits, "NoHits.bed")
    write.table(as.data.frame(nohits), "NoHits.tsv",
                quote = F, sep = "\t",
                col.names = TRUE, row.names = F)
    out <- c(out, nohits)
    out <- sortSeqlevels(out)
    sort(out, by = ~ seqnames + start + end)
}

################################################################################
## MAIN
################################################################################
mybed <- rtracklayer::import("./CREv2.bed")
## gr <- geneRanges(Homo.sapiens, column = "SYMBOL")
gr <- geneRanges2()
bedcom <- mapBedtoGene(gr, mybed, column = "SYMBOL")

rtracklayer::export(bedcom, "Annotated_CREv2.bed")
start(bedcom) <- start(bedcom) - 1 # Make it ready as BED
write.table(as.data.frame(bedcom), "Annotated_CREv2.tsv",
            quote = F, sep = "\t", col.names = TRUE, row.names = F)
	#!/usr/bin/env Rscript
	## Date: Tue Aug 7 10:43:27 2018
	## Last modified Time-stamp: <2018-08-10 12:54:38 CEST (vql)>
	## Author: Vang Quy Le
	## Organization: Aalborg University Hospital

	## Copyright 2018 Vang Quy Le
	##
	## This file is free software: you may copy, redistribute and/or modify it
	## under the terms of the GNU General Public License as published by the
	## Free Software Foundation, either version 2 of the License, or (at your
	## option) any later version.
	##
	## This file is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with this program. If not, see <http://www.gnu.org/licenses/>.
	################################################################################
	## INTRO
	## Clean up "Name" column of a BED file (CREv2.bed) by annotating the intervals
	## with gene names. The script also tries to add strand information whenever
	## possible.
	################################################################################

	################################################################################
	## LIBRARIES
	################################################################################

	library(Homo.sapiens)
	library(GenomicRanges)
	library(GenomicFeatures)

	################################################################################
	## FUNCTIONS
	################################################################################

	geneRanges2 <- function(){
	## Discussed here: https://support.bioconductor.org/p/111849/#111853
	uh <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene, single.strand.genes.only = FALSE)
	uh <- unlist(uh)
	mcols(uh) <- mapIds(org.Hs.eg.db, names(uh), "SYMBOL", "ENTREZID")
	names(mcols(uh)) <- "SYMBOL"
	uh
	}

	geneRanges3 <- function(){
	library(rtracklayer)
	gtffile <- "Homo_sapiens.GRCh37.87.gff3"
	gtf <- rtracklayer::import(gtffile, format = "gff3",
	feature.type = "gene", version = "3")
	hg19txdbensGene <- gtf
	## hg19txdbensGene <- makeTxDbFromEnsembl(organism="Homo sapiens",
	## release=93,
	## circ_seqs=DEFAULT_CIRC_SEQS,
	## server="ensembldb.ensembl.org")
	seqlevelsStyle(hg19txdbensGene) <- "UCSC"
	hg19txdbensGene <- keepStandardChromosomes(hg19txdbensGene,
	species = "Homo sapiens",
	pruning.mode = "tidy")
	names(mcols(hg19txdbensGene))[6] <- "SYMBOL"
	hg19txdbensGene
	}

	## This is discussed here: https://support.bioconductor.org/p/67118/
	geneRanges <- function(db, column="ENTREZID")
	{
	g <- genes(db, columns=column)
	col <- mcols(g)[[column]]
	genes <- granges(g)[rep(seq_along(g), elementNROWS(col))]
	mcols(genes)[[column]] <- as.character(unlist(col))
	genes
	}


	## query: the gene range
	## subject: the bed file
	splitColumnByOverlap <-function(query, subject, column="ENTREZID", ...)
	{
	olaps <- findOverlaps(query, subject, ...)
	f1 <- factor(subjectHits(olaps),
	levels=seq_len(subjectLength(olaps)))
	splitAsList(mcols(query)[[column]][queryHits(olaps)], f1)
	}

	mapBedtoGene <-function(genes, bed, column="ENTREZID", ...)
	{
	olaps <- findOverlaps(genes, bed, ...)
	f1 <- factor(subjectHits(olaps),
	levels=seq_len(subjectLength(olaps)))
	out <- bed[subjectHits(olaps)]
	mcols(out)[["oldname"]] <- mcols(out)[["name"]]
	mcols(out)[["name"]] <- mcols(genes)[[column]][queryHits(olaps)]
	strand(out) <- strand(genes)[queryHits(olaps)]
	nohits <- bed[-subjectHits(olaps)]

	## Fix names
	oldnames <- mcols(nohits)[["name"]]
	mcols(nohits)[["oldname"]] <- oldnames
	oldnames <- gsub("(ref\|ens)\\\|([A-Za-z0-9-]+),.*", "\\2", oldnames, perl = T)
	## oldnames <- gsub("^(.*)$", "\\1-NoHit", oldnames, perl = T)
	mcols(nohits)[["name"]] <- gsub("(ref\|ens)\\\|([A-Za-z0-9-]+)$", "\\2",
	oldnames, perl = T)
	rtracklayer::export(nohits, "NoHits.bed")
	write.table(as.data.frame(nohits), "NoHits.tsv",
	quote = F, sep = "\t",
	col.names = TRUE, row.names = F)
	out <- c(out, nohits)
	out <- sortSeqlevels(out)
	sort(out, by = ~ seqnames + start + end)
	}

	################################################################################
	## MAIN
	################################################################################
	mybed <- rtracklayer::import("./CREv2.bed")
	## gr <- geneRanges(Homo.sapiens, column = "SYMBOL")
	gr <- geneRanges2()
	bedcom <- mapBedtoGene(gr, mybed, column = "SYMBOL")

	rtracklayer::export(bedcom, "Annotated_CREv2.bed")
	start(bedcom) <- start(bedcom) - 1 # Make it ready as BED
	write.table(as.data.frame(bedcom), "Annotated_CREv2.tsv",
	quote = F, sep = "\t", col.names = TRUE, row.names = F)