Skip to content

Instantly share code, notes, and snippets.

@mcanouil
Last active February 2, 2024 23:34
Show Gist options
  • Save mcanouil/a4c4c2f1182cdea69d48bd095f8ec119 to your computer and use it in GitHub Desktop.
Save mcanouil/a4c4c2f1182cdea69d48bd095f8ec119 to your computer and use it in GitHub Desktop.
Annotation Using Islets Regulome (https://www.nature.com/articles/s41588-019-0457-0)
# # MIT License
#
# Copyright (c) 2024 Mickaël Canouil
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#' Annotation Using Islets Regulome
#'
#' https://www.nature.com/articles/s41588-019-0457-0
#'
#' @param data A data.frem or data.table with a "chr" and "position" columns.
#' @param build Genome assembly to use, default is "GRCh38", *i.e.*, upgrade supprementary files to "GRCh38".
#' @param crossmap The path to "CrossMap.py" to upgrade genome assembly from "GRCh37" to "GRCh38"
#' (from https://crossmap.readthedocs.io/en/latest/#installation).
#' @param chain_file Chain file to use with CrossMap to upgrade genome assembly from "GRCh37" to "GRCh38"
#' (from http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz).
#'
#' @return data.table
#'
#' @import data.table
#' @importFrom utils download.file unzip
#' @importFrom readxl::read_excel
annot_islets <- function(
data,
build = "GRCh38",
crossmap = "CrossMap.py",
chain_file = "hg19ToHg38.over.chain.gz"
) {
if (
inherits(
try(
system(
command = crossmap,
intern = TRUE,
ignore.stdout = TRUE,
ignore.stderr = TRUE
),
silent = TRUE
),
"try-error"
)
) {
stop('Consider running: "pip3 install CrossMap" or go to https://crossmap.readthedocs.io/en/latest/#installation')
}
if (!file.exists(chain_file) & any(tolower(build) %in% c("grch38" , "hg38"))) {
chain_file <- file.path(tempdir(), basename(chain_file))
download.file(
url = sprintf("http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/%s", basename(chain_file)),
destfile = chain_file
)
message(sprintf(
'Chain file has been downloaded from "%s" to: "%s"',
sprintf("http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/%s", basename(chain_file)),
chain_file
))
}
`:=` <- data.table::`:=`
url <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-019-0457-0/MediaObjects/41588_2019_457_MOESM4_ESM.zip"
if (!all(c("chr", "position") %in% colnames(data))) {
stop('"chr" and "position" must exists in "data"!')
}
if (!all(grepl("^chr", data[["chr"]]))) {
stop('Chromosome name must be prefixed with "chr"!')
}
output_directory <- file.path(tempdir(), "atacseq_supp")
dir.create(path = output_directory, recursive = TRUE, showWarnings = FALSE)
on.exit(unlink(x = output_directory, recursive = TRUE, force = TRUE))
annot_regions <- local({
utils::download.file(url = url, destfile = file.path(output_directory, "atacseq.zip"))
utils::unzip(
zipfile = file.path(output_directory, "atacseq.zip"),
exdir = output_directory,
overwrite = TRUE
)
unlink(file.path(output_directory, "__MACOSX"), recursive = TRUE)
tmp_annot_regions <- data.table::rbindlist(lapply(
X = list.files(
path = file.path(output_directory, "Supplementary_Data_Sets"),
pattern = paste(
paste0("Supplementary Data Set ", c(1, 2, 5, 9), "\\."),
collapse = "|"
),
full.names = TRUE
),
FUN = function(x) {
out <- readxl::read_excel(path = x, skip = 1)
out[["file"]] <- gsub(" ", "_", gsub("Supplementary Data Set (.).(.*).xlsx", "SDS\\1_\\2", basename(x)))
if (!grepl("Chromosome", colnames(out)[1])) {
out <- cbind(
as.data.frame(
x = data.table::tstrsplit(out[[1]], ":|-|,"),
col.names = c("Chromosome (hg19)", "Start (hg19)", "End (hg19)"),
check.names = FALSE
),
out[, -1]
)
}
out
}
), fill = TRUE)[
j = `:=`(
"Islet regulatory element" =
data.table::fifelse(is.na(`Islet regulatory element`), "NA", `Islet regulatory element`),
"uid" = 1:.N,
"Start (hg19)" = as.integer(`Start (hg19)`),
"End (hg19)" = as.integer(`End (hg19)`)
)
]
colnames(tmp_annot_regions)[1:5] <- c("chr", "start", "end", "region", "file")
if (any(tolower(build) %in% c("grch38" , "hg38"))) {
output_file <- file.path(output_directory, "annot_regions_grch37.bed")
data.table::fwrite(
x = tmp_annot_regions[, c("chr", "start", "end", "region", "file", "uid")],
file = output_file,
sep = "\t",
col.names = FALSE,
quote = FALSE
)
system(
command = paste(
crossmap, "bed",
chain_file,
output_file,
gsub("_grch37.bed$", "_grch38.bed", output_file)
),
wait = TRUE
)
out <- merge(
x = data.table::fread(
file = gsub("_grch37.bed$", "_grch38.bed", output_file),
col.names = c("chr", "start", "end", "region", "file", "uid")
)[
j = list(start = min(start), end = max(end)),
by = c("uid", "region", "file", "chr")
],
y = tmp_annot_regions[, -c("chr", "start", "end")],
by = c("region", "file", "uid")
)[j = .SD, .SDcols = colnames(tmp_annot_regions)]
} else {
out <- tmp_annot_regions
}
out[
j = "genomic_region" := paste0(chr, ":", start, "-", end)
][
j = lapply(.SD, function(x) {
if (any(is.character(x))) {
data.table::fifelse(x == "" | is.na(x), NA_character_, x)
} else {
x
}
})
]
})
data <- data.table::as.data.table(data)[j = rowid := 1:.N]
out <- merge(
x = data[, -c("chr", "position")],
y = annot_regions[
data,
c(
rowid = unique(rowid),
lapply(.SD, function(x) paste(x, collapse = ";"))
),
on = list(chr, start <= position, end >= position),
.SDcols = setdiff(colnames(annot_regions), "rowid"),
mult = "all",
by = .EACHI
][j = -c(1:3)],
by = "rowid"
)[j = -c("rowid")]
out
}
@mcanouil
Copy link
Author

mcanouil commented Nov 2, 2020

options(width = 120)
library(readxl)
library(data.table)
devtools::source_gist("https://gist.github.com/mcanouil/a4c4c2f1182cdea69d48bd095f8ec119")
#> ℹ Sourcing https://gist.githubusercontent.com/mcanouil/a4c4c2f1182cdea69d48bd095f8ec119/raw/6a660818bd0fc2cb667ef79d71df8e2b87d7434a/annot_islets.R
#> ℹ SHA-1 hash of file is ae6366bad1f2d5084c96be077b4166b28c2e8177

eqtl_data <- structure(
  list(
    pair = c(
      "ENSG00000280120--rs1727313", "ENSG00000280120--rs4148856", "ENSG00000165325--rs57235767", 
      "ENSG00000280120--rs1060105","ENSG00000166002--rs57235767"
    ),
    ensembl_id = c("ENSG00000280120", "ENSG00000280120", "ENSG00000165325", "ENSG00000280120", "ENSG00000166002"),
    rs_id = c("rs1727313", "rs4148856", "rs57235767", "rs1060105", "rs57235767"),
    distance_bp = c(3981, -186107, -49607, 169347, -198108),
    chromosome_name = c(12, 12, 11, 12, 11),
    start_position = c(123152324, 123152324, 93329971, 123152324, 93478472),
    pvalue = c(0.01, 0.002, 0.10, 0.2, 0.04)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
eqtl_data[["chr"]] <- paste0("chr", eqtl_data[["chromosome_name"]])
eqtl_data[["position"]] <- eqtl_data[["start_position"]] + eqtl_data[["distance_bp"]] + 1
eqtl_data
#>                          pair      ensembl_id      rs_id distance_bp chromosome_name start_position pvalue   chr
#> 1  ENSG00000280120--rs1727313 ENSG00000280120  rs1727313        3981              12      123152324  0.010 chr12
#> 2  ENSG00000280120--rs4148856 ENSG00000280120  rs4148856     -186107              12      123152324  0.002 chr12
#> 3 ENSG00000165325--rs57235767 ENSG00000165325 rs57235767      -49607              11       93329971  0.100 chr11
#> 4  ENSG00000280120--rs1060105 ENSG00000280120  rs1060105      169347              12      123152324  0.200 chr12
#> 5 ENSG00000166002--rs57235767 ENSG00000166002 rs57235767     -198108              11       93478472  0.040 chr11
#>    position
#> 1 123156306
#> 2 122966218
#> 3  93280365
#> 4 123321672
#> 5  93280365

eqtl_annot <- annot_islets(data = eqtl_data, build = "GRCh37")
eqtl_annot[, 1:15]
#>                           pair      ensembl_id      rs_id distance_bp chromosome_name start_position pvalue         chr
#> 1:  ENSG00000280120--rs1727313 ENSG00000280120  rs1727313        3981              12      123152324  0.010          NA
#> 2:  ENSG00000280120--rs4148856 ENSG00000280120  rs4148856     -186107              12      123152324  0.002       chr12
#> 3: ENSG00000165325--rs57235767 ENSG00000165325 rs57235767      -49607              11       93329971  0.100 chr11;chr11
#> 4:  ENSG00000280120--rs1060105 ENSG00000280120  rs1060105      169347              12      123152324  0.200       chr12
#> 5: ENSG00000166002--rs57235767 ENSG00000166002 rs57235767     -198108              11       93478472  0.040 chr11;chr11
#>                start               end region
#> 1:                NA                NA     NA
#> 2:         122865183         122982155     NA
#> 3: 92663488;92974900 93400255;93400255  NA;NA
#> 4:         123248624         124104201     NA
#> 5: 92663488;92974900 93400255;93400255  NA;NA
#>                                                                               file All genes in bait
#> 1:                                                                              NA                NA
#> 2:                                         SDS5__List_of_human_islet_enhancer_hubs                NA
#> 3: SDS5__List_of_human_islet_enhancer_hubs;SDS5__List_of_human_islet_enhancer_hubs             NA;NA
#> 4:                                         SDS5__List_of_human_islet_enhancer_hubs                NA
#> 5: SDS5__List_of_human_islet_enhancer_hubs;SDS5__List_of_human_islet_enhancer_hubs             NA;NA
#>    Bait genes expressed in islets (TPM>1.5) Bait genes with active islet promoters
#> 1:                                       NA                                     NA
#> 2:                                       NA                                     NA
#> 3:                                    NA;NA                                  NA;NA
#> 4:                                       NA                                     NA
#> 5:                                    NA;NA                                  NA;NA

eqtl_annot_wide <- dcast(
  data = eqtl_annot[,
    lapply(.SD, function(x) unlist(tstrsplit(x, ";"))),
    by = c("pair", "pvalue"),
    .SDcols = "file"
  ][
    j = file := c(
      "NA" = "None",
      "SDS1__Islet_regulome" = "Regulome",
      "SDS2__Enhancer-promoter_assignments" = "Enhancer Promoter",
      "SDS5__List_of_human_islet_enhancer_hubs" = "Enhancer Hub",
      "SDS9__List_of_human_islet_super-enhancers_defined_using_ROSE_algorithm" = "Super Enhancer"
    )[file]
  ][j = inside_region := 1],
  formula = ... ~ file,
  value.var = "inside_region",
  fill = 0, 
  fun.aggregate = sum
)
eqtl_annot_wide
#>                           pair pvalue Enhancer Hub None
#> 1: ENSG00000165325--rs57235767  0.100            2    0
#> 2: ENSG00000166002--rs57235767  0.040            2    0
#> 3:  ENSG00000280120--rs1060105  0.200            1    0
#> 4:  ENSG00000280120--rs1727313  0.010            0    1
#> 5:  ENSG00000280120--rs4148856  0.002            1    0

eqtl_annot <- annot_islets(data = eqtl_data, build = "GRCh38")
#> Chain file has been downloaded from "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz" to: "/tmp/RtmpTh26dD/hg19ToHg38.over.chain.gz"
eqtl_annot[, 1:15]
#>                           pair      ensembl_id      rs_id distance_bp chromosome_name start_position pvalue   chr
#> 1:  ENSG00000280120--rs1727313 ENSG00000280120  rs1727313        3981              12      123152324  0.010    NA
#> 2:  ENSG00000280120--rs4148856 ENSG00000280120  rs4148856     -186107              12      123152324  0.002 chr12
#> 3: ENSG00000165325--rs57235767 ENSG00000165325 rs57235767      -49607              11       93329971  0.100 chr11
#> 4:  ENSG00000280120--rs1060105 ENSG00000280120  rs1060105      169347              12      123152324  0.200    NA
#> 5: ENSG00000166002--rs57235767 ENSG00000166002 rs57235767     -198108              11       93478472  0.040 chr11
#>        start       end              region                 file All genes in bait
#> 1:        NA        NA                  NA                   NA                NA
#> 2: 122966023 122967009    Active_promoters SDS1__Islet_regulome                NA
#> 3:  93279423  93280520 Active_enhancers_II SDS1__Islet_regulome                NA
#> 4:        NA        NA                  NA                   NA                NA
#> 5:  93279423  93280520 Active_enhancers_II SDS1__Islet_regulome                NA
#>    Bait genes expressed in islets (TPM>1.5) Bait genes with active islet promoters
#> 1:                                       NA                                     NA
#> 2:                                       NA                                     NA
#> 3:                                       NA                                     NA
#> 4:                                       NA                                     NA
#> 5:                                       NA                                     NA

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment