Skip to content

Instantly share code, notes, and snippets.

@drakaki
Created May 24, 2024 13:31
Show Gist options
  • Save drakaki/9349469349d4d1666d312cd970def77f to your computer and use it in GitHub Desktop.
Save drakaki/9349469349d4d1666d312cd970def77f to your computer and use it in GitHub Desktop.
Script to find intersections across multiple bed files and create an UpSet plot illustrating the number of regions shared between them.
# Script for identifying the number of overlaps between any number of bed files. Given a set of N bed files,
# the script will find all 2-, 3- and 4-way unions (if N=4 for example, it will identify 11 unions), as well as
# the unique coordinates of each bed file.
# Load packages
library(ggplot2)
library(stringi)
library(stringr)
library(valr)
## valr package supports bedtools functions in R
library(dplyr)
library(combinat)
library(UpSetR)
# Pregame: List bed files you want to compare.
## For example, bed files could represent replicates of ChIP-seq peak files of a transcription factor.
setwd("/path/to/bed/dir/")
beds <- list.files(".")
## the following is an example of how to load your files into the global environment, while giving them a new name
for (i in beds){
# the string name you want to isolate here in order to name your files will be different across files
# so change it accordingly
filename <- str_split_i(i, pattern = "\\.", 1)
# use valr functions to load files
assign(filename, read_bed(i))
}
# add all bed files in a list object
## bed_list <- list(...)
# name the list objects in a concise way
## names(bed_list) <- c(...)
### for example: names(bed_list) <- c("rep1", "rep2", "rep3", "rep4")
# Step 1: use combinat library to generate all possible intersection names given your set of bed files
# to understand what that looks like run:
# unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)
find_ovls <- function(bed_list){
ovls <- list()
# Generate all combinations of file names
all_combinations <- unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)
# Perform intersections for all combinations
for (comb in all_combinations){
comb_name <- paste(comb, collapse = "_")
ovls[[comb_name]] <- bed_list[[comb[[1]]]]
for (i in 2:length(comb)){
if (ncol(ovls[[comb_name]])<=3) {
ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])
}
else {
# for 3-way intersections or above, we have to modify the df that has been outputed in the previous if statement
# since it is a product of bed_intersect, it has more than 3 columns with names like so: chrom, start.x, start.y
# and so on
# if you were to use it as so, it would give out an error since the function expects the first three columns
# to be chrom, start, end
ovls[[comb_name]] <- ovls[[comb_name]] %>%
# here you find the start and end of each overlapping interval (basically producing new coordinates)
dplyr::mutate(start = pmax(start.x, start.y), end = pmin(end.x, end.y)) %>%
select(chrom, start, end)
ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])
}
}
}
return(ovls)
}
ovls <- find_ovls(bed_list)
# Step 2: Identify unique peaks for each bed file
unique <- lapply(names(bed_list), function(x) {
y = bed_intersect(bed_list[[x]], bed_list[!(names(bed_list) == x)], invert=T)
return(y)
})
names(unique) <- names(bed_list)
# Create UpSet plot
counts <- unlist(lapply(c(bed_list, ovls), nrow))
names(counts) <- gsub("_", "&", names(counts))
upset(fromExpression(counts),
order.by = "freq",
decreasing = TRUE,
sets.bar.color = "#56B4E9",
mb.ratio = c(0.6, 0.4),
number.angles = 0,
text.scale = 2,
point.size = 2.8,
line.size = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment