Skip to content

Instantly share code, notes, and snippets.

@biocyberman
Created August 10, 2018 11:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save biocyberman/50ff17aabebc54f8953437f3529b1aeb to your computer and use it in GitHub Desktop.
Save biocyberman/50ff17aabebc54f8953437f3529b1aeb to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
## Date: Tue Aug 7 10:43:27 2018
## Last modified Time-stamp: <2018-08-10 12:54:38 CEST (vql)>
## Author: Vang Quy Le
## Organization: Aalborg University Hospital
## Copyright 2018 Vang Quy Le
##
## This file is free software: you may copy, redistribute and/or modify it
## under the terms of the GNU General Public License as published by the
## Free Software Foundation, either version 2 of the License, or (at your
## option) any later version.
##
## This file is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
################################################################################
## INTRO
## Clean up "Name" column of a BED file (CREv2.bed) by annotating the intervals
## with gene names. The script also tries to add strand information whenever
## possible.
################################################################################
################################################################################
## LIBRARIES
################################################################################
library(Homo.sapiens)
library(GenomicRanges)
library(GenomicFeatures)
################################################################################
## FUNCTIONS
################################################################################
geneRanges2 <- function(){
## Discussed here: https://support.bioconductor.org/p/111849/#111853
uh <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene, single.strand.genes.only = FALSE)
uh <- unlist(uh)
mcols(uh) <- mapIds(org.Hs.eg.db, names(uh), "SYMBOL", "ENTREZID")
names(mcols(uh)) <- "SYMBOL"
uh
}
geneRanges3 <- function(){
library(rtracklayer)
gtffile <- "Homo_sapiens.GRCh37.87.gff3"
gtf <- rtracklayer::import(gtffile, format = "gff3",
feature.type = "gene", version = "3")
hg19txdbensGene <- gtf
## hg19txdbensGene <- makeTxDbFromEnsembl(organism="Homo sapiens",
## release=93,
## circ_seqs=DEFAULT_CIRC_SEQS,
## server="ensembldb.ensembl.org")
seqlevelsStyle(hg19txdbensGene) <- "UCSC"
hg19txdbensGene <- keepStandardChromosomes(hg19txdbensGene,
species = "Homo sapiens",
pruning.mode = "tidy")
names(mcols(hg19txdbensGene))[6] <- "SYMBOL"
hg19txdbensGene
}
## This is discussed here: https://support.bioconductor.org/p/67118/
geneRanges <- function(db, column="ENTREZID")
{
g <- genes(db, columns=column)
col <- mcols(g)[[column]]
genes <- granges(g)[rep(seq_along(g), elementNROWS(col))]
mcols(genes)[[column]] <- as.character(unlist(col))
genes
}
## query: the gene range
## subject: the bed file
splitColumnByOverlap <-function(query, subject, column="ENTREZID", ...)
{
olaps <- findOverlaps(query, subject, ...)
f1 <- factor(subjectHits(olaps),
levels=seq_len(subjectLength(olaps)))
splitAsList(mcols(query)[[column]][queryHits(olaps)], f1)
}
mapBedtoGene <-function(genes, bed, column="ENTREZID", ...)
{
olaps <- findOverlaps(genes, bed, ...)
f1 <- factor(subjectHits(olaps),
levels=seq_len(subjectLength(olaps)))
out <- bed[subjectHits(olaps)]
mcols(out)[["oldname"]] <- mcols(out)[["name"]]
mcols(out)[["name"]] <- mcols(genes)[[column]][queryHits(olaps)]
strand(out) <- strand(genes)[queryHits(olaps)]
nohits <- bed[-subjectHits(olaps)]
## Fix names
oldnames <- mcols(nohits)[["name"]]
mcols(nohits)[["oldname"]] <- oldnames
oldnames <- gsub("(ref|ens)\\|([A-Za-z0-9-]+),.*", "\\2", oldnames, perl = T)
## oldnames <- gsub("^(.*)$", "\\1-NoHit", oldnames, perl = T)
mcols(nohits)[["name"]] <- gsub("(ref|ens)\\|([A-Za-z0-9-]+)$", "\\2",
oldnames, perl = T)
rtracklayer::export(nohits, "NoHits.bed")
write.table(as.data.frame(nohits), "NoHits.tsv",
quote = F, sep = "\t",
col.names = TRUE, row.names = F)
out <- c(out, nohits)
out <- sortSeqlevels(out)
sort(out, by = ~ seqnames + start + end)
}
################################################################################
## MAIN
################################################################################
mybed <- rtracklayer::import("./CREv2.bed")
## gr <- geneRanges(Homo.sapiens, column = "SYMBOL")
gr <- geneRanges2()
bedcom <- mapBedtoGene(gr, mybed, column = "SYMBOL")
rtracklayer::export(bedcom, "Annotated_CREv2.bed")
start(bedcom) <- start(bedcom) - 1 # Make it ready as BED
write.table(as.data.frame(bedcom), "Annotated_CREv2.tsv",
quote = F, sep = "\t", col.names = TRUE, row.names = F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment