# mdozmorov/gist_mm39_excluderanges.R

## gist_mm39_excluderanges.R
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.

# Fetch the mm39 exclusion regions from BEDbase, load them as a GRanges
# object, and attach UCSC mm39 seqinfo (lengths, circularity, genome name).
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# BEDbase identifier of the exclusion set
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
# Output file name
fileNameOut <- "mm39.excluderanges.bed.gz"
# URL of the BED file endpoint for this identifier
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
# Download the file. Stop on an HTTP error status so that an error page
# written to disk is never parsed as if it were BED data.
response <- GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
stop_for_status(response, task = "download the mm39 excluderanges BED file")
# Read the data in: character chr, numeric start/end, character name,
# numeric score, character strand
mm39.excluderanges <- readr::read_tsv(fileNameOut,
                                      col_names = FALSE,
                                      col_types = "cddcdc")
# Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "score", "strand",
                 "signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[seq_len(ncol(mm39.excluderanges))]
# Convert to GRanges object
# NOTE(review): BED starts are conventionally 0-based, while
# makeGRangesFromDataFrame() defaults to starts.in.df.are.0based = FALSE.
# Confirm whether the starts in this file should be shifted.
mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
                                               keep.extra.columns = TRUE)
# Seqinfo for mm39 genome
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
                                                 assembled.molecules.only = TRUE)
# Subset and match to chromosomes in the mm39.excluderanges object
# Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
# Subset mm39.excluderanges
mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
                                    pruning.mode = "tidy")
# Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
# Reorder chrom_data rows to follow the seqlevels order of the GRanges object
chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
# Assign seqinfo data
seqlengths(mm39.excluderanges) <- chrom_data$size
# A sequence is circular only when UCSC reports TRUE; FALSE and NA both map to
# FALSE. Testing is.na() alone would flag every non-NA entry as circular.
isCircular(mm39.excluderanges) <- chrom_data$circular %in% TRUE
genome(mm39.excluderanges)     <- "mm39"

mm39.excluderanges
	# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
	# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
	# software, High Signal and Low Mappability regions.
	# See https://github.com/dozmorovlab/excluderanges for more information.

	# Fetch the mm39 exclusion regions from BEDbase, load them as a GRanges
	# object, and attach UCSC mm39 seqinfo (lengths, circularity, genome name).
	# NOTE(review): this tab-indented section repeats the script at the top of
	# the file; running the whole file performs the download and conversion
	# twice. Confirm whether the duplicate is intentional.
	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# BEDbase identifier of the exclusion set
	bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
	# Output file name
	fileNameOut <- "mm39.excluderanges.bed.gz"
	# URL of the BED file endpoint for this identifier
	token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
	# Download the file. Stop on an HTTP error status so that an error page
	# written to disk is never parsed as if it were BED data.
	response <- GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
	stop_for_status(response, task = "download the mm39 excluderanges BED file")
	# Read the data in: character chr, numeric start/end, character name,
	# numeric score, character strand
	mm39.excluderanges <- readr::read_tsv(fileNameOut,
	  col_names = FALSE,
	  col_types = "cddcdc")
	# Assign column names depending on the number of columns
	all_columns <- c("chr", "start", "end", "name", "score", "strand",
	  "signalValue", "pValue", "qValue", "peak")
	colnames(mm39.excluderanges) <- all_columns[seq_len(ncol(mm39.excluderanges))]
	# Convert to GRanges object
	# NOTE(review): BED starts are conventionally 0-based, while
	# makeGRangesFromDataFrame() defaults to starts.in.df.are.0based = FALSE.
	# Confirm whether the starts in this file should be shifted.
	mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
	  keep.extra.columns = TRUE)
	# Seqinfo for mm39 genome
	chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
	  assembled.molecules.only = TRUE)
	# Subset and match to chromosomes in the mm39.excluderanges object
	# Common chromosomes
	chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
	# Subset mm39.excluderanges
	mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
	  pruning.mode = "tidy")
	# Subset chrom_data
	chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
	# Reorder chrom_data rows to follow the seqlevels order of the GRanges object
	chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
	# Assign seqinfo data
	seqlengths(mm39.excluderanges) <- chrom_data$size
	# A sequence is circular only when UCSC reports TRUE; FALSE and NA both map
	# to FALSE. Testing is.na() alone would flag every non-NA entry as circular.
	isCircular(mm39.excluderanges) <- chrom_data$circular %in% TRUE
	genome(mm39.excluderanges) <- "mm39"

	mm39.excluderanges