SamBuckberry/bias_correct_footprints_BETA.R

## bias_correct_footprints_BETA.R
### Get Tn5 bias track for regions

bias_bigwig <- "~/Desktop/mm10_bias_chr19.Scores.bigwig"
atac_regions <- "~/polo_iPSC/ATACseq/processed_data/atac_peaks/atac_all_peaks_union.bed"
atac_bam <- "~/Desktop/atac_iPSC_combined_replicates_chr19.bam"
ctcf <- read.table("~/R_packages/RunATAC/inst/exdata/ctcf.pwm") %>% as.matrix()

#' Read BED formatted file to GRanges object
#'
#' @param bed_file Path Path to BED formatted file.
#' @return A GRanges object
#' @examples
#' \dontrun{
#' import_bed("myBedFile.bed")
#' }
#' @note Only imports the first 3 columns of a bed file.
#' @export
read_bed <- function(bed_file){
        dat <- data.table::fread(bed_file, sep = "\t", header = FALSE)
        gr <- GenomicRanges::GRanges(seqnames = dat$V1,
                                     ranges = IRanges(start = dat$V2,
                                                      end = dat$V3))
        return(gr)
}


#' Read BAM file Tn5 insertion positions into GRanges object
#'
#' @param bam_file Path to a BAM formatted file.
#' @param yieldSize The number of records from the BAM file to read in each chunk. See ?BamFile.
#' @param ... Arguments passed to ScanBamParam.
#' @return A GRanges object of Tn5 insertion positions
#' @examples
#' \dontrun{
#' tn <- import_atac_pos("test_100k.bam")
#' }
#' @export
read_atac_bam_tn <- function(bam_file, yieldSize=1e6, ...){

        # Allow the setting of parameters for Bam file scan such as Which regions
        param <- Rsamtools::ScanBamParam(...)

        message("Processing BAM file. This this may take a while for large files...")
        message("If you have lots of RAM, increase yieldSize to speed things up.")

        # Loop for reading the BAM file in chunks
        bf <- Rsamtools::BamFile(file = bam_file, yieldSize = yieldSize)
        open(bf)
        gr <- NULL
        repeat {
                chunk <- GenomicAlignments::readGAlignments(bf, param = param)
                if (length(chunk) == 0L)
                        break
                chunk_gr <- GenomicRanges::GRanges(chunk)
                if (is.null(gr)) {
                        gr <- chunk_gr
                } else {
                        gr <- c(gr, chunk_gr)
                }
        }
        close(bf)

        gr <- GRangesList(gr) %>% unlist()

        message("Offsetting alignments to Tn5 integration site.")

        # Offset the reads to correspond to tn5 insertion site
        pos <- gr[strand(gr) == "+"] %>%
                GenomicRanges::shift(shift=4) %>%
                GenomicRanges::resize(width = 2, fix = "start")

        neg <- gr[strand(gr) == "-"] %>%
                GenomicRanges::shift(shift = -5) %>%
                GenomicRanges::resize(width = 2, fix = "start")

        # Return the pos and neg strands together
        gr <- c(pos, neg)

        message("Done!")

        return(gr)
}


#' Get the positions of the PWM in ATAC-seq peaks.
#'
#' @param gr A GRanges object. Usually ranges of ATAC-seq peaks.
#' @param pwm A positional weight matrix of class Matirx.
#' @param genome A BSgenome object.
#' @param min.score Minimum alignment score for pwm. Default = "85%".
#' @return A GRanges object with positions of PWM match in peaks.
#' @export
#' @importFrom magrittr %>%
#' @import GenomicRanges
#' @import IRanges
#' @examples
#' \dontrun{
#' library(BSgenome.Mmusculus.UCSC.mm10)
#' ctcf <- read.table("/data/ctcf.pwm") %>% as.matrix()
#' peaks <- read_bed("/data/atac_peaks.bed")
#' mpos <- motif_gr(peaks, ctcf, Mmusculus)
#' }
motif_gr <- function(gr, pwm, genome=Mmusculus, min.score="85%"){

        if(class(gr) != "GRanges"){
                stop("Object gr is not of class GRanges.")
        }
        if(class(pwm) != "matrix"){
                stop("Object pwm is not of class matrix.")
        }
        if(class(genome) != "BSgenome"){
                stop("Object genome is not of class BSgenome.")
        }
        if(class(min.score) != "character"){
                stop("Object genome is not of class character.")
        }

        # Get the nucleotide sequences
        sequences <- BSgenome::getSeq(x = genome, names=gr)

        # Function to find motif in one sequence
        find_motif_start <- function(x)
        {
                motif_starts <- Biostrings::matchPWM(pwm = pwm, subject = sequences[[x]],
                                                     min.score = min.score) %>% start()
                starts <- start(gr[x]) + motif_starts
                if (length(starts) == 0){
                        out <- NULL
                } else {
                        ends <- starts + ncol(pwm)
                        out <- GenomicRanges::GRanges(seqnames = seqnames(gr[x]),
                                                      ranges = IRanges(start = starts,
                                                                       end = ends))
                }

                return(out)
        }

        # Apply the function across all sequences
        motif_ranges <- lapply(X = 1:length(sequences), FUN = find_motif_start)

        # Remove the NULLs from where no motif was detected
        is.NullOb <- function(x) is.null(motif_ranges[x]) | all(sapply(motif_ranges[x], is.null))
        no_keep <- lapply(1:length(motif_ranges), is.NullOb) %>% unlist()

        # Convert to Granges object
        motif_ranges <- motif_ranges[!no_keep] %>% unlist() %>% GRangesList() %>% unlist()

        # Decrease the width to motif centre
        motif_ranges <- resize(motif_ranges, width = 2, fix = 'center')

        # Return GRanges object
        return(motif_ranges)
}


atac_peaks <- read_bed(atac_regions)
atac_peaks <- atac_peaks[seqnames(atac_peaks) == "chr19"]

motif_pos_gr <- motif_gr(gr = atac_peaks, pwm = ctcf)

tn_signal_gr <- rtracklayer::import("~/Desktop/nucleoatac_iPSC.ins.bigwig",
                                    format="BigWig", which=motif_pos_gr)


averagePerBin <- function(x, binsize, mcolnames=NULL)
{
        if (!is(x, "GenomicRanges"))
                stop("'x' must be a GenomicRanges object")
        if (any(is.na(seqlengths(x))))
                stop("'seqlengths(x)' contains NAs")
        bins <- IRangesList(lapply(seqlengths(x),
                                   function(seqlen)
                                           IRanges(breakInChunks(seqlen, binsize))))
        ans <- as(bins, "GRanges")
        seqinfo(ans) <- seqinfo(x)
        if (is.null(mcolnames))
                return(ans)
        averageMCol <- function(colname)
        {
                cvg <- coverage(x, weight=colname)
                views_list <- RleViewsList(
                        lapply(names(cvg),
                               function(seqname)
                                       Views(cvg[[seqname]], bins[[seqname]])))
                unlist(viewMeans(views_list), use.names=FALSE)
        }
        mcols(ans) <- DataFrame(lapply(mcols(x)[mcolnames], averageMCol))
        ans
}


bias_correct_atac_motif <- function(tn_signal_gr, motif_pos_gr, bias_bigwig, range=100){

        # Resize to range of signal collection
        motif_pos_gr <- GenomicRanges::resize(motif_pos_gr, width = range*2,
                                              fix = 'center')

        # Compute bias signal -----------------------------------------------------

        # Get the bias signal for regions
        bias <- rtracklayer::import(con = bias_bigwig, format = "Bigwig",
                                    which=motif_pos_gr)

        get_bias_scores <- function(x){

                # Subset to one region
                motif_pos <- motif_pos_gr[x]

                # Get the bias calculation for range
                bias_sub <- bias[IRanges::overlapsAny(query = bias,
                                                      subject = motif_pos)]

                # Calculate the relative distance to the motif centre for insertions
                bias_dist <- (start(motif_pos) + range) - start(bias_sub)

                # Get bias scores for relative position
                df <- data.frame(rel_pos=bias_dist, bias_score=bias_sub$score)

                return(df)

        }

        message("Calculating Tn5 insertion bias scores...")
        bias_scores <- lapply(1:length(motif_pos_gr), get_bias_scores)
        df <- join_all(bias_scores, by = 'rel_pos', type = 'left')

        # Compute Tn5 signal -----------------------------------------------------

        get_insertion_scores <- function(x){

                # Subset to one region
                motif_pos <- motif_pos_gr[x]

                # Get the bias calculation for range
                tn_score <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
                                                      subject = motif_pos)]

                test <- averagePerBin(tn_score, binsize = 1, mcolnames = "score")

                # Calculate the relative distance to the motif centre for insertions
                tn_dist <- (start(motif_pos) + range) - start(tn_score)

                # Get bias scores for relative position
                df <- data.frame(rel_pos=bias_dist, bias_score=bias_sub$score)

                return(df)

        }


        # Subset the Tn5 signal
        tn_signal_gr <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
                                                          subject = motif_pos_gr)]

        #For each motif region, get the distances of nearby Tn5 insertions
        get_tn_dist <- function(x){

                # Subset to one region
                motif_pos <- motif_pos_gr[x]

                # Get the tn5 hits for range
                tn_sub <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
                                                            subject = motif_pos)]

                test <- coverage(tn_sub)

                # Calculate the relative distance to the motif centre for insertions
                tn_dist <- (start(motif_pos) + range) - start(tn_sub)

                # Check if no Tn5 insertion events at position
                if(length(tn_dist) < 1){
                        tn_dist <- NA
                }

                return(tn_dist)
        }

        # Apply the function over all motif positions
        dists <- lapply(1:length(motif_pos_gr), get_tn_dist)
        dists <- dists[!is.na(dists)] %>% unlist()

        # Calculate the histogram
        hist_dists <- table(dists) %>% data.frame()

        colnames(hist_dists) <- c("rel_pos", "insertions")

        # Merge insertion frequency with bias
        results <- merge.data.frame(bias_mean, hist_dists,
                                    by = "rel_pos", all = TRUE)

        results$rel_pos <- as.character(results$rel_pos) %>% as.numeric()
        results$bias_mean <- as.character(results$bias_mean) %>% as.numeric()
        results$insertions <- as.character(results$insertions) %>% as.numeric()

        # sort the results
        results <- results[order(results$rel_pos, decreasing = FALSE), ]

        # scale the insertions from 0-1
        range01 <- function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}
        results$insertions_scaled <- range01(result$insertions, na.rm = TRUE)

        # Return results
        return(results)

}
	### Get Tn5 bias track for regions

	bias_bigwig <- "~/Desktop/mm10_bias_chr19.Scores.bigwig"
	atac_regions <- "~/polo_iPSC/ATACseq/processed_data/atac_peaks/atac_all_peaks_union.bed"
	atac_bam <- "~/Desktop/atac_iPSC_combined_replicates_chr19.bam"
	ctcf <- read.table("~/R_packages/RunATAC/inst/exdata/ctcf.pwm") %>% as.matrix()

	#' Read BED formatted file to GRanges object
	#'
	#' @param bed_file Path Path to BED formatted file.
	#' @return A GRanges object
	#' @examples
	#' \dontrun{
	#' import_bed("myBedFile.bed")
	#' }
	#' @note Only imports the first 3 columns of a bed file.
	#' @export
	read_bed <- function(bed_file){
	dat <- data.table::fread(bed_file, sep = "\t", header = FALSE)
	gr <- GenomicRanges::GRanges(seqnames = dat$V1,
	ranges = IRanges(start = dat$V2,
	end = dat$V3))
	return(gr)
	}




	#' Read BAM file Tn5 insertion positions into GRanges object
	#'
	#' @param bam_file Path to a BAM formatted file.
	#' @param yieldSize The number of records from the BAM file to read in each chunk. See ?BamFile.
	#' @param ... Arguments passed to ScanBamParam.
	#' @return A GRanges object of Tn5 insertion positions
	#' @examples
	#' \dontrun{
	#' tn <- import_atac_pos("test_100k.bam")
	#' }
	#' @export
	read_atac_bam_tn <- function(bam_file, yieldSize=1e6, ...){

	# Allow the setting of parameters for Bam file scan such as Which regions
	param <- Rsamtools::ScanBamParam(...)

	message("Processing BAM file. This this may take a while for large files...")
	message("If you have lots of RAM, increase yieldSize to speed things up.")

	# Loop for reading the BAM file in chunks
	bf <- Rsamtools::BamFile(file = bam_file, yieldSize = yieldSize)
	open(bf)
	gr <- NULL
	repeat {
	chunk <- GenomicAlignments::readGAlignments(bf, param = param)
	if (length(chunk) == 0L)
	break
	chunk_gr <- GenomicRanges::GRanges(chunk)
	if (is.null(gr)) {
	gr <- chunk_gr
	} else {
	gr <- c(gr, chunk_gr)
	}
	}
	close(bf)

	gr <- GRangesList(gr) %>% unlist()

	message("Offsetting alignments to Tn5 integration site.")

	# Offset the reads to correspond to tn5 insertion site
	pos <- gr[strand(gr) == "+"] %>%
	GenomicRanges::shift(shift=4) %>%
	GenomicRanges::resize(width = 2, fix = "start")

	neg <- gr[strand(gr) == "-"] %>%
	GenomicRanges::shift(shift = -5) %>%
	GenomicRanges::resize(width = 2, fix = "start")

	# Return the pos and neg strands together
	gr <- c(pos, neg)

	message("Done!")

	return(gr)
	}


	#' Get the positions of the PWM in ATAC-seq peaks.
	#'
	#' @param gr A GRanges object. Usually ranges of ATAC-seq peaks.
	#' @param pwm A positional weight matrix of class Matirx.
	#' @param genome A BSgenome object.
	#' @param min.score Minimum alignment score for pwm. Default = "85%".
	#' @return A GRanges object with positions of PWM match in peaks.
	#' @export
	#' @importFrom magrittr %>%
	#' @import GenomicRanges
	#' @import IRanges
	#' @examples
	#' \dontrun{
	#' library(BSgenome.Mmusculus.UCSC.mm10)
	#' ctcf <- read.table("/data/ctcf.pwm") %>% as.matrix()
	#' peaks <- read_bed("/data/atac_peaks.bed")
	#' mpos <- motif_gr(peaks, ctcf, Mmusculus)
	#' }
	motif_gr <- function(gr, pwm, genome=Mmusculus, min.score="85%"){

	if(class(gr) != "GRanges"){
	stop("Object gr is not of class GRanges.")
	}
	if(class(pwm) != "matrix"){
	stop("Object pwm is not of class matrix.")
	}
	if(class(genome) != "BSgenome"){
	stop("Object genome is not of class BSgenome.")
	}
	if(class(min.score) != "character"){
	stop("Object genome is not of class character.")
	}

	# Get the nucleotide sequences
	sequences <- BSgenome::getSeq(x = genome, names=gr)

	# Function to find motif in one sequence
	find_motif_start <- function(x)
	{
	motif_starts <- Biostrings::matchPWM(pwm = pwm, subject = sequences[[x]],
	min.score = min.score) %>% start()
	starts <- start(gr[x]) + motif_starts
	if (length(starts) == 0){
	out <- NULL
	} else {
	ends <- starts + ncol(pwm)
	out <- GenomicRanges::GRanges(seqnames = seqnames(gr[x]),
	ranges = IRanges(start = starts,
	end = ends))
	}

	return(out)
	}

	# Apply the function across all sequences
	motif_ranges <- lapply(X = 1:length(sequences), FUN = find_motif_start)

	# Remove the NULLs from where no motif was detected
	is.NullOb <- function(x) is.null(motif_ranges[x]) \| all(sapply(motif_ranges[x], is.null))
	no_keep <- lapply(1:length(motif_ranges), is.NullOb) %>% unlist()

	# Convert to Granges object
	motif_ranges <- motif_ranges[!no_keep] %>% unlist() %>% GRangesList() %>% unlist()

	# Decrease the width to motif centre
	motif_ranges <- resize(motif_ranges, width = 2, fix = 'center')

	# Return GRanges object
	return(motif_ranges)
	}



	atac_peaks <- read_bed(atac_regions)
	atac_peaks <- atac_peaks[seqnames(atac_peaks) == "chr19"]

	motif_pos_gr <- motif_gr(gr = atac_peaks, pwm = ctcf)

	tn_signal_gr <- rtracklayer::import("~/Desktop/nucleoatac_iPSC.ins.bigwig",
	format="BigWig", which=motif_pos_gr)



	averagePerBin <- function(x, binsize, mcolnames=NULL)
	{
	if (!is(x, "GenomicRanges"))
	stop("'x' must be a GenomicRanges object")
	if (any(is.na(seqlengths(x))))
	stop("'seqlengths(x)' contains NAs")
	bins <- IRangesList(lapply(seqlengths(x),
	function(seqlen)
	IRanges(breakInChunks(seqlen, binsize))))
	ans <- as(bins, "GRanges")
	seqinfo(ans) <- seqinfo(x)
	if (is.null(mcolnames))
	return(ans)
	averageMCol <- function(colname)
	{
	cvg <- coverage(x, weight=colname)
	views_list <- RleViewsList(
	lapply(names(cvg),
	function(seqname)
	Views(cvg[[seqname]], bins[[seqname]])))
	unlist(viewMeans(views_list), use.names=FALSE)
	}
	mcols(ans) <- DataFrame(lapply(mcols(x)[mcolnames], averageMCol))
	ans
	}




	bias_correct_atac_motif <- function(tn_signal_gr, motif_pos_gr, bias_bigwig, range=100){

	# Resize to range of signal collection
	motif_pos_gr <- GenomicRanges::resize(motif_pos_gr, width = range*2,
	fix = 'center')

	# Compute bias signal -----------------------------------------------------

	# Get the bias signal for regions
	bias <- rtracklayer::import(con = bias_bigwig, format = "Bigwig",
	which=motif_pos_gr)

	get_bias_scores <- function(x){

	# Subset to one region
	motif_pos <- motif_pos_gr[x]

	# Get the bias calculation for range
	bias_sub <- bias[IRanges::overlapsAny(query = bias,
	subject = motif_pos)]

	# Calculate the relative distance to the motif centre for insertions
	bias_dist <- (start(motif_pos) + range) - start(bias_sub)

	# Get bias scores for relative position
	df <- data.frame(rel_pos=bias_dist, bias_score=bias_sub$score)

	return(df)

	}

	message("Calculating Tn5 insertion bias scores...")
	bias_scores <- lapply(1:length(motif_pos_gr), get_bias_scores)
	df <- join_all(bias_scores, by = 'rel_pos', type = 'left')

	# Compute Tn5 signal -----------------------------------------------------

	get_insertion_scores <- function(x){

	# Subset to one region
	motif_pos <- motif_pos_gr[x]

	# Get the bias calculation for range
	tn_score <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
	subject = motif_pos)]

	test <- averagePerBin(tn_score, binsize = 1, mcolnames = "score")

	# Calculate the relative distance to the motif centre for insertions
	tn_dist <- (start(motif_pos) + range) - start(tn_score)

	# Get bias scores for relative position
	df <- data.frame(rel_pos=bias_dist, bias_score=bias_sub$score)

	return(df)

	}








	# Subset the Tn5 signal
	tn_signal_gr <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
	subject = motif_pos_gr)]

	#For each motif region, get the distances of nearby Tn5 insertions
	get_tn_dist <- function(x){

	# Subset to one region
	motif_pos <- motif_pos_gr[x]

	# Get the tn5 hits for range
	tn_sub <- tn_signal_gr[IRanges::overlapsAny(query = tn_signal_gr,
	subject = motif_pos)]

	test <- coverage(tn_sub)

	# Calculate the relative distance to the motif centre for insertions
	tn_dist <- (start(motif_pos) + range) - start(tn_sub)

	# Check if no Tn5 insertion events at position
	if(length(tn_dist) < 1){
	tn_dist <- NA
	}

	return(tn_dist)
	}

	# Apply the function over all motif positions
	dists <- lapply(1:length(motif_pos_gr), get_tn_dist)
	dists <- dists[!is.na(dists)] %>% unlist()

	# Calculate the histogram
	hist_dists <- table(dists) %>% data.frame()

	colnames(hist_dists) <- c("rel_pos", "insertions")

	# Merge insertion frequency with bias
	results <- merge.data.frame(bias_mean, hist_dists,
	by = "rel_pos", all = TRUE)

	results$rel_pos <- as.character(results$rel_pos) %>% as.numeric()
	results$bias_mean <- as.character(results$bias_mean) %>% as.numeric()
	results$insertions <- as.character(results$insertions) %>% as.numeric()

	# sort the results
	results <- results[order(results$rel_pos, decreasing = FALSE), ]

	# scale the insertions from 0-1
	range01 <- function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}
	results$insertions_scaled <- range01(result$insertions, na.rm = TRUE)

	# Return results
	return(results)

	}