mdozmorov/gist_T2T_excluderanges.R

## gist_T2T_excluderanges.R
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.

suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "6548a002754cc1e882035293541b59a8"
# Construct output file name
fileNameOut <- "T2T.excluderanges.bed.gz"
# API token for BED data
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
# Download file
GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
# Read the data in
T2T.excluderanges <- readr::read_tsv(fileNameOut,
                                     col_names = FALSE,
                                     col_types = c("cddcdc"))
# Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "score", "strand",
                 "signalValue", "pValue", "qValue", "peak")
colnames(T2T.excluderanges) <- all_columns[1:ncol(T2T.excluderanges)]
# Convert to GRanges object
T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges,
                                              keep.extra.columns = TRUE)
# Seqinfo for T2T genome
chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0",
                                                 assembled.molecules.only = TRUE) # GCA_009914755.4
chrom_data$AssignedMolecule <- as.character(paste0("chr", chrom_data$AssignedMolecule))
# Make the same format as UCSC chromosome data
chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule,
                         size = chrom_data$SequenceLength,
                         assembled = ifelse(chrom_data$AssemblyUnit == "Primary Assembly", TRUE, FALSE),
                         circular = chrom_data$circular)
# Keep standard chromosomes
chromosomes_standard <- chrom_data$chrom
# Subset and match to chromosomes in the T2T.excluderanges object
# Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges))
# Subset T2T.excluderanges
T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common,
                                   pruning.mode = "tidy")
# Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
# Match objects
chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ]
# Assign seqinfo data
seqlengths(T2T.excluderanges) <- chrom_data$size
isCircular(T2T.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(T2T.excluderanges)     <- "T2T-CHM13v2.0" # "GCA_009914755.4"

T2T.excluderanges
	# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
	# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
	# software, High Signal and Low Mappability regions.
	# See https://github.com/dozmorovlab/excluderanges for more information.

	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# bedbase_id
	bedbase_id <- "6548a002754cc1e882035293541b59a8"
	# Construct output file name
	fileNameOut <- "T2T.excluderanges.bed.gz"
	# API token for BED data
	token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
	# Download file
	GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
	# Read the data in
	T2T.excluderanges <- readr::read_tsv(fileNameOut,
	col_names = FALSE,
	col_types = c("cddcdc"))
	# Assign column names depending on the number of columns
	all_columns <- c("chr", "start", "end", "name", "score", "strand",
	"signalValue", "pValue", "qValue", "peak")
	colnames(T2T.excluderanges) <- all_columns[1:ncol(T2T.excluderanges)]
	# Convert to GRanges object
	T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges,
	keep.extra.columns = TRUE)
	# Seqinfo for T2T genome
	chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0",
	assembled.molecules.only = TRUE) # GCA_009914755.4
	chrom_data$AssignedMolecule <- as.character(paste0("chr", chrom_data$AssignedMolecule))
	# Make the same format as UCSC chromosome data
	chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule,
	size = chrom_data$SequenceLength,
	assembled = ifelse(chrom_data$AssemblyUnit == "Primary Assembly", TRUE, FALSE),
	circular = chrom_data$circular)
	# Keep standard chromosomes
	chromosomes_standard <- chrom_data$chrom
	# Subset and match to chromosomes in the T2T.excluderanges object
	# Common chromosomes
	chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges))
	# Subset T2T.excluderanges
	T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common,
	pruning.mode = "tidy")
	# Subset chrom_data
	chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
	# Match objects
	chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ]
	# Assign seqinfo data
	seqlengths(T2T.excluderanges) <- chrom_data$size
	isCircular(T2T.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
	genome(T2T.excluderanges) <- "T2T-CHM13v2.0" # "GCA_009914755.4"

	T2T.excluderanges