Skip to content

Instantly share code, notes, and snippets.

@mdozmorov
Last active September 19, 2022 19:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdozmorov/7f4393f90932fb6bd911c43c20425ca0 to your computer and use it in GitHub Desktop.
Save mdozmorov/7f4393f90932fb6bd911c43c20425ca0 to your computer and use it in GitHub Desktop.
T2T excluderanges download
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "6548a002754cc1e882035293541b59a8"
# Construct output file name
fileNameOut <- "T2T.excluderanges.bed.gz"
# API token for BED data
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
# Download file
GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
# Read the data in
T2T.excluderanges <- readr::read_tsv(fileNameOut,
col_names = FALSE,
col_types = c("cddcdc"))
# Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "score", "strand",
"signalValue", "pValue", "qValue", "peak")
colnames(T2T.excluderanges) <- all_columns[1:ncol(T2T.excluderanges)]
# Convert to GRanges object
T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges,
keep.extra.columns = TRUE)
# Seqinfo for T2T genome
chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0",
assembled.molecules.only = TRUE) # GCA_009914755.4
chrom_data$AssignedMolecule <- as.character(paste0("chr", chrom_data$AssignedMolecule))
# Make the same format as UCSC chromosome data
chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule,
size = chrom_data$SequenceLength,
assembled = ifelse(chrom_data$AssemblyUnit == "Primary Assembly", TRUE, FALSE),
circular = chrom_data$circular)
# Keep standard chromosomes
chromosomes_standard <- chrom_data$chrom
# Subset and match to chromosomes in the T2T.excluderanges object
# Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges))
# Subset T2T.excluderanges
T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common,
pruning.mode = "tidy")
# Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
# Match objects
chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ]
# Assign seqinfo data
seqlengths(T2T.excluderanges) <- chrom_data$size
isCircular(T2T.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(T2T.excluderanges) <- "T2T-CHM13v2.0" # "GCA_009914755.4"
T2T.excluderanges
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment