mdozmorov/liftOver.R

## liftOver.R
# How to liftOver Paired BED data
library(dplyr)
library(tidyr)
library(readxl)
library(rtracklayer)

# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
# https://www.nature.com/articles/s41586-020-2151-x
# Supplementary Table 4 | Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"
fileNameIn1 <- "loops.xlsx" # Download Excel data
download.file(url1, fileNameIn1)
mtx <- read_xlsx(fileNameIn1, skip = 4) # Read it in
mtx <- mtx[, 1:6] # First 6 columns are coordinates
colnames(mtx) <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2")

# Save as BEDPE
mtx.bedpe <- mtx %>% mutate(unite(as.data.frame(mtx[, 1:6]), col = "name", sep = ":"),
                            score = ".", strand1 = ".", strand2 = ".")
fileNameOut1 <- "mtx.bedpe"
write.table(as.data.frame(mtx.bedpe), fileNameOut1, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)

# hg19 to hg38 conversion chain
url2 <- "http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz"
fileNameIn2 <- "hg19ToHg38.over.chain.gz"
download.file(url2, fileNameIn2)
system(paste0("gzip -d ", fileNameIn2))
fileNameIn2 <- "hg19ToHg38.over.chain"
ch <- import.chain(fileNameIn2)

# Attempt to liftOver Pairs object
# Import from file
read12 <- import(fileNameOut1, format = "bedpe")
mtx.bedpe.hg38 <- liftOver(read12, ch) # Fails

# Attempt to liftOver individual GRanges, then merge
# Select first read
read1 <- mtx[, 1:3]
colnames(read1) <- c("chr", "start", "end")
# Add metadata identifier column - combined coordinates of both reads
read1 <- read1 %>% mutate(unite(as.data.frame(mtx[, 1:6]), col = "name", sep = ":"))
# GRanges for the first read
GR.read1 <- makeGRangesFromDataFrame(read1)

GR.read1.hg38 <- liftOver(GR.read1, ch)
GR.read1.hg38 %>% unlist() # Metadata missing, can't merge

# Python2 version of liftOver individual reads, then merge
# https://github.com/dphansti/liftOverBedpe
# python liftOverBedpe.py --lift liftOver --chain hg19ToHg38.over.chain --i mtx.bedpe --o mtxhg38.bedpe
	# How to liftOver Paired BED data
	library(dplyr)
	library(tidyr)
	library(readxl)
	library(rtracklayer)

	# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
	# https://www.nature.com/articles/s41586-020-2151-x
	# Supplementary Table 4 \| Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
	url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"
	fileNameIn1 <- "loops.xlsx" # Download Excel data
	download.file(url1, fileNameIn1)
	mtx <- read_xlsx(fileNameIn1, skip = 4) # Read it in
	mtx <- mtx[, 1:6] # First 6 columns are coordinates
	colnames(mtx) <- c("chrom1", "start1", "end1", "chrom2", "start2", "end2")

	# Save as BEDPE
	mtx.bedpe <- mtx %>% mutate(unite(as.data.frame(mtx[, 1:6]), col = "name", sep = ":"),
	score = ".", strand1 = ".", strand2 = ".")
	fileNameOut1 <- "mtx.bedpe"
	write.table(as.data.frame(mtx.bedpe), fileNameOut1, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)

	# hg19 to hg38 conversion chain
	url2 <- "http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz"
	fileNameIn2 <- "hg19ToHg38.over.chain.gz"
	download.file(url2, fileNameIn2)
	system(paste0("gzip -d ", fileNameIn2))
	fileNameIn2 <- "hg19ToHg38.over.chain"
	ch <- import.chain(fileNameIn2)

	# Attempt to liftOver Pairs object
	# Import from file
	read12 <- import(fileNameOut1, format = "bedpe")
	mtx.bedpe.hg38 <- liftOver(read12, ch) # Fails

	# Attempt to liftOver individual GRanges, then merge
	# Select first read
	read1 <- mtx[, 1:3]
	colnames(read1) <- c("chr", "start", "end")
	# Add metadata identifier column - combined coordinates of both reads
	read1 <- read1 %>% mutate(unite(as.data.frame(mtx[, 1:6]), col = "name", sep = ":"))
	# GRanges for the first read
	GR.read1 <- makeGRangesFromDataFrame(read1)

	GR.read1.hg38 <- liftOver(GR.read1, ch)
	GR.read1.hg38 %>% unlist() # Metadata missing, can't merge

	# Python2 version of liftOver individual reads, then merge
	# https://github.com/dphansti/liftOverBedpe
	# python liftOverBedpe.py --lift liftOver --chain hg19ToHg38.over.chain --i mtx.bedpe --o mtxhg38.bedpe