drakaki/multibed_intersect.R

## multibed_intersect.R
# Script for identifying the number of overlaps between any number of bed files. Given a set of N bed files,
# the script will find all 2-, 3- and 4-way unions (if N=4 for example, it will identify 11 unions), as well as
# the unique coordinates of each bed file.

# Load packages
library(ggplot2)
library(stringi)
library(stringr)
library(valr)
## valr package supports bedtools functions in R
library(dplyr)
library(combinat)
library(UpSetR)

# Pregame: List bed files you want to compare.
## For example, bed files could represent replicates of ChIP-seq peak files of a transcription factor.
setwd("/path/to/bed/dir/")

beds <- list.files(".")

## the following is an example of how to load your files into the global environment, while giving them a new name
for (i in beds){

  # the string name you want to isolate here in order to name your files will be different across files
  # so change it accordingly

  filename <- str_split_i(i, pattern = "\\.", 1)
  # use valr functions to load files
  assign(filename, read_bed(i))

}

# add all bed files in a list object
## bed_list <- list(...)
# name the list objects in a concise way
## names(bed_list) <- c(...)
### for example: names(bed_list) <- c("rep1", "rep2", "rep3", "rep4")

# Step 1: use combinat library to generate all possible intersection names given your set of bed files
# to understand what that looks like run:
# unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)

find_ovls <- function(bed_list){

  ovls <- list()

  # Generate all combinations of file names
  all_combinations <- unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)

  # Perform intersections for all combinations
  for (comb in all_combinations){

    comb_name <- paste(comb, collapse = "_")

    ovls[[comb_name]] <- bed_list[[comb[[1]]]]

    for (i in 2:length(comb)){

      if (ncol(ovls[[comb_name]])<=3) {

        ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])

      }

      else {
        # for 3-way intersections or above, we have to modify the df that has been outputed in the previous if statement
        # since it is a product of bed_intersect, it has more than 3 columns with names like so: chrom, start.x, start.y
        # and so on
        # if you were to use it as so, it would give out an error since the function expects the first three columns
        # to be chrom, start, end
        ovls[[comb_name]] <- ovls[[comb_name]] %>%
          # here you find the start and end of each overlapping interval (basically producing new coordinates)
            dplyr::mutate(start = pmax(start.x, start.y), end = pmin(end.x, end.y)) %>%
            select(chrom, start, end)

        ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])
      }
    }
  }

  return(ovls)
}

ovls <- find_ovls(bed_list)

# Step 2: Identify unique peaks for each bed file

unique <- lapply(names(bed_list), function(x) {
  y = bed_intersect(bed_list[[x]], bed_list[!(names(bed_list) == x)], invert=T)
  return(y)
})

names(unique) <- names(bed_list)

# Create UpSet plot

counts <- unlist(lapply(c(bed_list, ovls), nrow))

names(counts) <- gsub("_", "&", names(counts))

upset(fromExpression(counts),
      order.by = "freq",
      decreasing = TRUE,
      sets.bar.color = "#56B4E9",
      mb.ratio = c(0.6, 0.4),
      number.angles = 0,
      text.scale = 2,
      point.size = 2.8,
      line.size = 1)
	# Script for identifying the number of overlaps between any number of bed files. Given a set of N bed files,
	# the script will find all 2-, 3- and 4-way unions (if N=4 for example, it will identify 11 unions), as well as
	# the unique coordinates of each bed file.

	# Load packages
	library(ggplot2)
	library(stringi)
	library(stringr)
	library(valr)
	## valr package supports bedtools functions in R
	library(dplyr)
	library(combinat)
	library(UpSetR)

	# Pregame: List bed files you want to compare.
	## For example, bed files could represent replicates of ChIP-seq peak files of a transcription factor.
	setwd("/path/to/bed/dir/")

	beds <- list.files(".")

	## the following is an example of how to load your files into the global environment, while giving them a new name
	for (i in beds){

	# the string name you want to isolate here in order to name your files will be different across files
	# so change it accordingly

	filename <- str_split_i(i, pattern = "\\.", 1)
	# use valr functions to load files
	assign(filename, read_bed(i))

	}

	# add all bed files in a list object
	## bed_list <- list(...)
	# name the list objects in a concise way
	## names(bed_list) <- c(...)
	### for example: names(bed_list) <- c("rep1", "rep2", "rep3", "rep4")

	# Step 1: use combinat library to generate all possible intersection names given your set of bed files
	# to understand what that looks like run:
	# unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)

	find_ovls <- function(bed_list){

	ovls <- list()

	# Generate all combinations of file names
	all_combinations <- unlist(lapply(2:length(bed_list), function(x) combn(names(bed_list), x, simplify = FALSE)), recursive = FALSE)

	# Perform intersections for all combinations
	for (comb in all_combinations){

	comb_name <- paste(comb, collapse = "_")

	ovls[[comb_name]] <- bed_list[[comb[[1]]]]

	for (i in 2:length(comb)){

	if (ncol(ovls[[comb_name]])<=3) {

	ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])

	}

	else {
	# for 3-way intersections or above, we have to modify the df that has been outputed in the previous if statement
	# since it is a product of bed_intersect, it has more than 3 columns with names like so: chrom, start.x, start.y
	# and so on
	# if you were to use it as so, it would give out an error since the function expects the first three columns
	# to be chrom, start, end
	ovls[[comb_name]] <- ovls[[comb_name]] %>%
	# here you find the start and end of each overlapping interval (basically producing new coordinates)
	dplyr::mutate(start = pmax(start.x, start.y), end = pmin(end.x, end.y)) %>%
	select(chrom, start, end)

	ovls[[comb_name]] <- bed_intersect(ovls[[comb_name]], bed_list[[comb[[i]]]])
	}
	}
	}

	return(ovls)
	}

	ovls <- find_ovls(bed_list)

	# Step 2: Identify unique peaks for each bed file

	unique <- lapply(names(bed_list), function(x) {
	y = bed_intersect(bed_list[[x]], bed_list[!(names(bed_list) == x)], invert=T)
	return(y)
	})

	names(unique) <- names(bed_list)

	# Create UpSet plot

	counts <- unlist(lapply(c(bed_list, ovls), nrow))

	names(counts) <- gsub("_", "&", names(counts))

	upset(fromExpression(counts),
	order.by = "freq",
	decreasing = TRUE,
	sets.bar.color = "#56B4E9",
	mb.ratio = c(0.6, 0.4),
	number.angles = 0,
	text.scale = 2,
	point.size = 2.8,
	line.size = 1)