Skip to content

Instantly share code, notes, and snippets.

@mdozmorov
Created September 19, 2022 23:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdozmorov/33a1fa3234b2942dae238e1fcb39c996 to your computer and use it in GitHub Desktop.
Save mdozmorov/33a1fa3234b2942dae238e1fcb39c996 to your computer and use it in GitHub Desktop.
mm39 excluderanges download
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
# Construct output file name
fileNameOut <- "mm39.excluderanges.bed.gz"
# API token for BED data
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
# Download file
GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
# Read the data in
mm39.excluderanges <- readr::read_tsv(fileNameOut,
col_names = FALSE,
col_types = c("cddcdc"))
# Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "score", "strand",
"signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)]
# Convert to GRanges object
mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
keep.extra.columns = TRUE)
# Seqinfo for mm39 genome
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
assembled.molecules.only = TRUE)
# Subset and match to chromosomes in the mm39.excluderanges object
# Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
# Subset mm39.excluderanges
mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
pruning.mode = "tidy")
# Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
# Match objects
chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
# Assign seqinfo data
seqlengths(mm39.excluderanges) <- chrom_data$size
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(mm39.excluderanges) <- "mm39"
mm39.excluderanges
@kcoutinh
Copy link

Hi! I tried this code and I needed to make col_names=TRUE here:
mm39.excluderanges <- readr::read_tsv(fileNameOut, col_names = FALSE, col_types = c("cddcdc"))

I also needed to swap score and strand here:
all_columns <- c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak")

@mdozmorov
Copy link
Author

Hi @kcoutinh, can you copy your complete code here? BedBase download also doesn't work, I wonder if you corrected the API.

@kcoutinh
Copy link

So I actually downloaded the mm39 from the google drive provided in your lab's exclude ranges vignette: https://dozmorovlab.github.io/excluderanges/#excluderanges-genomic-ranges-of-problematic-genomic-regions

@kcoutinh
Copy link

`# Construct output file name
fileNameOut <- "mm39.excluderanges.bed"

Read the data in

mm39.excluderanges <- readr::read_tsv(fileNameOut,
col_names = TRUE)

Assign column names depending on the number of columns

all_columns <- c("chr", "start", "end", "name", "strand","score",
"signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)]

Convert to GRanges object

mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
keep.extra.columns = TRUE)

Seqinfo for mm39 genome

chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
assembled.molecules.only = TRUE)

Subset and match to chromosomes in the mm39.excluderanges object

Common chromosomes

chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))

Subset mm39.excluderanges

mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
pruning.mode = "tidy")

Subset chrom_data

chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]

Match objects

chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]

Assign seqinfo data

seqlengths(mm39.excluderanges) <- chrom_data$size
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(mm39.excluderanges) <- "mm39"

mm39.excluderanges

bed<- as.data.frame(mm39.excluderanges)

Rename columns

colnames(bed) <- c("chrom", "chromStart", "chromEnd", "width", "strand", "score", "name")

Reorder columns

bed_2 <- bed[, c("chrom", "chromStart", "chromEnd", "name", "score", "strand")]

write.table( bed, file = "mm39.excluderanges_2.bed",
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)`

@mdozmorov
Copy link
Author

I believe, you got the file from Google Drive? mm39.excluderanges.bed. It doesn't have column names, so should be col_names = FALSE. Here's the head:

chr1	5494200	5506700	12501	*	High Signal Region
chr1	6063300	6069400	6101	*	High Signal Region
chr1	8728600	8802500	73901	*	High Signal Region

@kcoutinh
Copy link

Opes my bad. Here is the edited code.

`suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
library(readr)

bedbase_id

bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"

Construct output file name

fileNameOut <- "mm39.excluderanges.bed"

API token for BED data

#token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")

Download file

#GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))

Read the data in

mm39.excluderanges <- readr::read_tsv(fileNameOut,
col_names = FALSE)

Assign column names depending on the number of columns

all_columns <- c("chr", "start", "end", "name", "strand","score",
"signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)]

Convert to GRanges object

mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
keep.extra.columns = TRUE)

Seqinfo for mm39 genome

chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
assembled.molecules.only = TRUE)

Subset and match to chromosomes in the mm39.excluderanges object

Common chromosomes

chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))

Subset mm39.excluderanges

mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
pruning.mode = "tidy")

Subset chrom_data

chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]

Match objects

chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]

Assign seqinfo data

seqlengths(mm39.excluderanges) <- chrom_data$size
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(mm39.excluderanges) <- "mm39"

mm39.excluderanges

bed<- as.data.frame(mm39.excluderanges)

Rename columns

colnames(bed) <- c("chrom", "chromStart", "chromEnd", "width", "strand", "score", "name")

Reorder columns

bed_2 <- bed[, c("chrom", "chromStart", "chromEnd", "name", "score", "strand")]

write.table( bed, file = "mm39.excluderanges_2.bed",
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment