# mcanouil/annot_islets.R

## annot_islets.R
# # MIT License
#
# Copyright (c) 2024 Mickaël Canouil
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#' Annotation Using Islets Regulome
#'
#' Annotate genomic positions with the regions listed in Supplementary Data
#' Sets 1, 2, 5 and 9 of <https://www.nature.com/articles/s41588-019-0457-0>.
#'
#' The supplementary archive is downloaded into a temporary directory that is
#' removed when the function exits. The regions are read with hg19 (GRCh37)
#' coordinates; when `build` is "GRCh38" or "hg38" (case-insensitive), they are
#' lifted over with CrossMap and the fragments of a lifted region located on
#' the same chromosome are merged back into one interval (smallest start,
#' largest end). Any other value of `build` keeps the hg19 coordinates and
#' requires neither CrossMap nor a chain file.
#'
#' A position is annotated by every region of the same chromosome for which
#' `start <= position <= end`; multiple matches are collapsed with ";".
#'
#' @param data A data.frame or data.table with "chr" and "position" columns.
#'     Chromosome names must be prefixed with "chr".
#' @param build Genome assembly to use, default is "GRCh38", *i.e.*, upgrade
#'     supplementary files to "GRCh38".
#' @param crossmap The path to "CrossMap.py" to upgrade genome assembly from
#'     "GRCh37" to "GRCh38"
#'     (from https://crossmap.readthedocs.io/en/latest/#installation).
#' @param chain_file Chain file to use with CrossMap to upgrade genome assembly
#'     from "GRCh37" to "GRCh38"
#'     (from http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz).
#'     When the file does not exist, a file with the same base name is
#'     downloaded from that location into the session temporary directory.
#'
#' @return A data.table with the columns of `data` other than "chr" and
#'     "position", and the annotation columns of the overlapping regions.
#'
#' @import data.table
#' @importFrom utils download.file unzip
#' @importFrom readxl read_excel
annot_islets <- function(
  data,
  build = "GRCh38",
  crossmap = "CrossMap.py",
  chain_file = "hg19ToHg38.over.chain.gz"
) {

  # Validate the input first, so that a bad call fails before any external
  # command is run or any file is downloaded.
  if (!all(c("chr", "position") %in% colnames(data))) {
    stop('"chr" and "position" must exists in "data"!')
  }

  if (!all(grepl("^chr", data[["chr"]]))) {
    stop('Chromosome name must be prefixed with "chr"!')
  }

  # CrossMap and the chain file are only needed when coordinates are lifted.
  lift_over <- any(tolower(build) %in% c("grch38", "hg38"))

  if (lift_over) {
    # system(intern = TRUE) raises an error when the command cannot be run.
    crossmap_available <- tryCatch(
      expr = {
        system(
          command = crossmap,
          intern = TRUE,
          ignore.stdout = TRUE,
          ignore.stderr = TRUE
        )
        TRUE
      },
      error = function(e) FALSE
    )
    if (!crossmap_available) {
      stop('Consider running: "pip3 install CrossMap" or go to https://crossmap.readthedocs.io/en/latest/#installation')
    }

    if (!file.exists(chain_file)) {
      chain_url <- sprintf(
        "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/%s",
        basename(chain_file)
      )
      chain_file <- file.path(tempdir(), basename(chain_file))
      utils::download.file(url = chain_url, destfile = chain_file)
      message(sprintf(
        'Chain file has been downloaded from "%s" to: "%s"',
        chain_url,
        chain_file
      ))
    }
  }

  # Local binding so that `:=` is resolved without attaching data.table.
  `:=` <- data.table::`:=`

  url <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-019-0457-0/MediaObjects/41588_2019_457_MOESM4_ESM.zip"

  output_directory <- file.path(tempdir(), "atacseq_supp")
  dir.create(path = output_directory, recursive = TRUE, showWarnings = FALSE)
  # add = TRUE keeps any exit handler registered earlier.
  on.exit(
    unlink(x = output_directory, recursive = TRUE, force = TRUE),
    add = TRUE
  )

  annot_regions <- local({
    zip_file <- file.path(output_directory, "atacseq.zip")
    utils::download.file(url = url, destfile = zip_file)
    utils::unzip(
      zipfile = zip_file,
      exdir = output_directory,
      overwrite = TRUE
    )
    unlink(file.path(output_directory, "__MACOSX"), recursive = TRUE)

    supp_files <- list.files(
      path = file.path(output_directory, "Supplementary_Data_Sets"),
      pattern = paste(
        paste0("Supplementary Data Set ", c(1, 2, 5, 9), "\\."),
        collapse = "|"
      ),
      full.names = TRUE
    )
    if (length(supp_files) == 0L) {
      stop("No supplementary data set found in the downloaded archive!")
    }

    tmp_annot_regions <- data.table::rbindlist(lapply(
      X = supp_files,
      FUN = function(x) {
        out <- readxl::read_excel(path = x, skip = 1)
        out[["file"]] <- gsub(" ", "_", gsub("Supplementary Data Set (.).(.*).xlsx", "SDS\\1_\\2", basename(x)))

        # When the first column is not a chromosome column, it is split on
        # ":", "-" and "," into chromosome, start and end columns.
        if (!grepl("Chromosome", colnames(out)[1])) {
          out <- cbind(
            as.data.frame(
              x = data.table::tstrsplit(out[[1]], ":|-|,"),
              col.names = c("Chromosome  (hg19)", "Start  (hg19)", "End (hg19)"),
              check.names = FALSE
            ),
            out[, -1]
          )
        }

        out
      }
    ), fill = TRUE)[
      j = `:=`(
        "Islet regulatory element" =
          data.table::fifelse(is.na(`Islet regulatory element`), "NA", `Islet regulatory element`),
        "uid" = seq_len(.N),
        "Start  (hg19)" = as.integer(`Start  (hg19)`),
        "End (hg19)" = as.integer(`End (hg19)`)
      )
    ]

    colnames(tmp_annot_regions)[1:5] <- c("chr", "start", "end", "region", "file")

    if (lift_over) {
      bed_grch37 <- file.path(output_directory, "annot_regions_grch37.bed")
      bed_grch38 <- file.path(output_directory, "annot_regions_grch38.bed")
      data.table::fwrite(
        x = tmp_annot_regions[, c("chr", "start", "end", "region", "file", "uid")],
        file = bed_grch37,
        sep = "\t",
        col.names = FALSE,
        quote = FALSE
      )

      # Paths are quoted so that spaces do not split the command line;
      # path.expand() resolves "~" before quoting disables shell expansion.
      system(
        command = paste(
          crossmap, "bed",
          shQuote(path.expand(chain_file)),
          shQuote(bed_grch37),
          shQuote(bed_grch38)
        ),
        wait = TRUE
      )
      if (!file.exists(bed_grch38)) {
        stop(sprintf('CrossMap did not produce the lifted file "%s"!', bed_grch38))
      }

      # A lifted region may come back as several fragments sharing the same
      # uid: keep one interval per uid/region/file/chr.
      out <- merge(
        x = data.table::fread(
          file = bed_grch38,
          col.names = c("chr", "start", "end", "region", "file", "uid")
        )[
          j = list(start = min(start), end = max(end)),
          by = c("uid", "region", "file", "chr")
        ],
        y = tmp_annot_regions[, -c("chr", "start", "end")],
        by = c("region", "file", "uid")
      )[j = .SD, .SDcols = colnames(tmp_annot_regions)]
    } else {
      out <- tmp_annot_regions
    }

    out[
      j = "genomic_region" := paste0(chr, ":", start, "-", end)
    ][
      # Empty strings are turned into missing values in character columns.
      j = lapply(.SD, function(x) {
        if (is.character(x)) {
          data.table::fifelse(x == "" | is.na(x), NA_character_, x)
        } else {
          x
        }
      })
    ]
  })

  # "rowid" links each input row to its collapsed annotations.
  data <- data.table::as.data.table(data)[j = rowid := seq_len(.N)]

  out <- merge(
    x = data[, -c("chr", "position")],
    y = annot_regions[
      data,
      c(
        rowid = unique(rowid),
        lapply(.SD, function(x) paste(x, collapse = ";"))
      ),
      on = list(chr, start <= position, end >= position),
      .SDcols = setdiff(colnames(annot_regions), "rowid"),
      mult = "all",
      by = .EACHI
    ][j = -c(1:3)],
    by = "rowid"
  )[j = -c("rowid")]

  out
}
	# # MIT License
	#
	# Copyright (c) 2024 Mickaël Canouil
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	#' Annotation Using Islets Regulome
	#'
	#' Annotate genomic positions with the regions listed in Supplementary Data
	#' Sets 1, 2, 5 and 9 of <https://www.nature.com/articles/s41588-019-0457-0>.
	#'
	#' The supplementary archive is downloaded into a temporary directory that is
	#' removed when the function exits. The regions are read with hg19 (GRCh37)
	#' coordinates; when `build` is "GRCh38" or "hg38" (case-insensitive), they are
	#' lifted over with CrossMap and the fragments of a lifted region located on
	#' the same chromosome are merged back into one interval (smallest start,
	#' largest end). Any other value of `build` keeps the hg19 coordinates and
	#' requires neither CrossMap nor a chain file.
	#'
	#' A position is annotated by every region of the same chromosome for which
	#' `start <= position <= end`; multiple matches are collapsed with ";".
	#'
	#' @param data A data.frame or data.table with "chr" and "position" columns.
	#'     Chromosome names must be prefixed with "chr".
	#' @param build Genome assembly to use, default is "GRCh38", *i.e.*, upgrade
	#'     supplementary files to "GRCh38".
	#' @param crossmap The path to "CrossMap.py" to upgrade genome assembly from
	#'     "GRCh37" to "GRCh38"
	#'     (from https://crossmap.readthedocs.io/en/latest/#installation).
	#' @param chain_file Chain file to use with CrossMap to upgrade genome assembly
	#'     from "GRCh37" to "GRCh38"
	#'     (from http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz).
	#'     When the file does not exist, a file with the same base name is
	#'     downloaded from that location into the session temporary directory.
	#'
	#' @return A data.table with the columns of `data` other than "chr" and
	#'     "position", and the annotation columns of the overlapping regions.
	#'
	#' @import data.table
	#' @importFrom utils download.file unzip
	#' @importFrom readxl read_excel
	annot_islets <- function(
	  data,
	  build = "GRCh38",
	  crossmap = "CrossMap.py",
	  chain_file = "hg19ToHg38.over.chain.gz"
	) {

	  # Validate the input first, so that a bad call fails before any external
	  # command is run or any file is downloaded.
	  if (!all(c("chr", "position") %in% colnames(data))) {
	    stop('"chr" and "position" must exists in "data"!')
	  }

	  if (!all(grepl("^chr", data[["chr"]]))) {
	    stop('Chromosome name must be prefixed with "chr"!')
	  }

	  # CrossMap and the chain file are only needed when coordinates are lifted.
	  lift_over <- any(tolower(build) %in% c("grch38", "hg38"))

	  if (lift_over) {
	    # system(intern = TRUE) raises an error when the command cannot be run.
	    crossmap_available <- tryCatch(
	      expr = {
	        system(
	          command = crossmap,
	          intern = TRUE,
	          ignore.stdout = TRUE,
	          ignore.stderr = TRUE
	        )
	        TRUE
	      },
	      error = function(e) FALSE
	    )
	    if (!crossmap_available) {
	      stop('Consider running: "pip3 install CrossMap" or go to https://crossmap.readthedocs.io/en/latest/#installation')
	    }

	    if (!file.exists(chain_file)) {
	      chain_url <- sprintf(
	        "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/%s",
	        basename(chain_file)
	      )
	      chain_file <- file.path(tempdir(), basename(chain_file))
	      utils::download.file(url = chain_url, destfile = chain_file)
	      message(sprintf(
	        'Chain file has been downloaded from "%s" to: "%s"',
	        chain_url,
	        chain_file
	      ))
	    }
	  }

	  # Local binding so that `:=` is resolved without attaching data.table.
	  `:=` <- data.table::`:=`

	  url <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-019-0457-0/MediaObjects/41588_2019_457_MOESM4_ESM.zip"

	  output_directory <- file.path(tempdir(), "atacseq_supp")
	  dir.create(path = output_directory, recursive = TRUE, showWarnings = FALSE)
	  # add = TRUE keeps any exit handler registered earlier.
	  on.exit(
	    unlink(x = output_directory, recursive = TRUE, force = TRUE),
	    add = TRUE
	  )

	  annot_regions <- local({
	    zip_file <- file.path(output_directory, "atacseq.zip")
	    utils::download.file(url = url, destfile = zip_file)
	    utils::unzip(
	      zipfile = zip_file,
	      exdir = output_directory,
	      overwrite = TRUE
	    )
	    unlink(file.path(output_directory, "__MACOSX"), recursive = TRUE)

	    supp_files <- list.files(
	      path = file.path(output_directory, "Supplementary_Data_Sets"),
	      pattern = paste(
	        paste0("Supplementary Data Set ", c(1, 2, 5, 9), "\\."),
	        collapse = "|"
	      ),
	      full.names = TRUE
	    )
	    if (length(supp_files) == 0L) {
	      stop("No supplementary data set found in the downloaded archive!")
	    }

	    tmp_annot_regions <- data.table::rbindlist(lapply(
	      X = supp_files,
	      FUN = function(x) {
	        out <- readxl::read_excel(path = x, skip = 1)
	        out[["file"]] <- gsub(" ", "_", gsub("Supplementary Data Set (.).(.*).xlsx", "SDS\\1_\\2", basename(x)))

	        # When the first column is not a chromosome column, it is split on
	        # ":", "-" and "," into chromosome, start and end columns.
	        if (!grepl("Chromosome", colnames(out)[1])) {
	          out <- cbind(
	            as.data.frame(
	              x = data.table::tstrsplit(out[[1]], ":|-|,"),
	              col.names = c("Chromosome  (hg19)", "Start  (hg19)", "End (hg19)"),
	              check.names = FALSE
	            ),
	            out[, -1]
	          )
	        }

	        out
	      }
	    ), fill = TRUE)[
	      j = `:=`(
	        "Islet regulatory element" =
	          data.table::fifelse(is.na(`Islet regulatory element`), "NA", `Islet regulatory element`),
	        "uid" = seq_len(.N),
	        "Start  (hg19)" = as.integer(`Start  (hg19)`),
	        "End (hg19)" = as.integer(`End (hg19)`)
	      )
	    ]

	    colnames(tmp_annot_regions)[1:5] <- c("chr", "start", "end", "region", "file")

	    if (lift_over) {
	      bed_grch37 <- file.path(output_directory, "annot_regions_grch37.bed")
	      bed_grch38 <- file.path(output_directory, "annot_regions_grch38.bed")
	      data.table::fwrite(
	        x = tmp_annot_regions[, c("chr", "start", "end", "region", "file", "uid")],
	        file = bed_grch37,
	        sep = "\t",
	        col.names = FALSE,
	        quote = FALSE
	      )

	      # Paths are quoted so that spaces do not split the command line;
	      # path.expand() resolves "~" before quoting disables shell expansion.
	      system(
	        command = paste(
	          crossmap, "bed",
	          shQuote(path.expand(chain_file)),
	          shQuote(bed_grch37),
	          shQuote(bed_grch38)
	        ),
	        wait = TRUE
	      )
	      if (!file.exists(bed_grch38)) {
	        stop(sprintf('CrossMap did not produce the lifted file "%s"!', bed_grch38))
	      }

	      # A lifted region may come back as several fragments sharing the same
	      # uid: keep one interval per uid/region/file/chr.
	      out <- merge(
	        x = data.table::fread(
	          file = bed_grch38,
	          col.names = c("chr", "start", "end", "region", "file", "uid")
	        )[
	          j = list(start = min(start), end = max(end)),
	          by = c("uid", "region", "file", "chr")
	        ],
	        y = tmp_annot_regions[, -c("chr", "start", "end")],
	        by = c("region", "file", "uid")
	      )[j = .SD, .SDcols = colnames(tmp_annot_regions)]
	    } else {
	      out <- tmp_annot_regions
	    }

	    out[
	      j = "genomic_region" := paste0(chr, ":", start, "-", end)
	    ][
	      # Empty strings are turned into missing values in character columns.
	      j = lapply(.SD, function(x) {
	        if (is.character(x)) {
	          data.table::fifelse(x == "" | is.na(x), NA_character_, x)
	        } else {
	          x
	        }
	      })
	    ]
	  })

	  # "rowid" links each input row to its collapsed annotations.
	  data <- data.table::as.data.table(data)[j = rowid := seq_len(.N)]

	  out <- merge(
	    x = data[, -c("chr", "position")],
	    y = annot_regions[
	      data,
	      c(
	        rowid = unique(rowid),
	        lapply(.SD, function(x) paste(x, collapse = ";"))
	      ),
	      on = list(chr, start <= position, end >= position),
	      .SDcols = setdiff(colnames(annot_regions), "rowid"),
	      mult = "all",
	      by = .EACHI
	    ][j = -c(1:3)],
	    by = "rowid"
	  )[j = -c("rowid")]

	  out
	}