dpryan79/BamToInsertsizeBigWig.py

## BamToInsertsizeBigWig.py
#!/usr/bin/env python
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptools import writeBedGraph
from deeptools import utilities
from deeptools.utilities import getCommonChrNames, toBytes

import sys
import shutil
import os
import numpy as np

BAM = "something.bam"  # This is a BAM file name
binLength = 1000  # Window size
stepSize = 50 # The step size by which each window is shifted
numberOfProcessors = 20  # Number of threads
outFileName = "something.bigWig"  # Ouput file name
outFileFormat = "bigwig"  # or "bedgraph"

def writeBedGraph_wrapper(args):
    """
    Passes the arguments to writeBedGraph_worker.
    This is a step required given
    the constrains from the multiprocessing module.
    The args var, contains as first element the 'self' value
    from the WriteBedGraph object

    """
    return foo.writeBedGraph_worker(*args)

class foo(writeBedGraph.WriteBedGraph):
    def calculate_mean_isize(self, bamHandle, chrom, regions):
        rv = np.zeros(len(regions), dtype='float64')
        for idx, reg in enumerate(regions):
            vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0]
            if len(vals) > 0:
                rv[idx] = np.mean(np.abs(vals))
        return rv

    def count_reads_in_region(self, chrom, start, end):
        bam_handlers = []
        for fname in self.bamFilesList:
            bam_handlers.append(bamHandler.openBam(fname))
        # A list of lists of tuples
        transcriptsToConsider = []
        for i in range(start, end, self.stepSize):
            if i + self.binLength > end:
                break
            transcriptsToConsider.append([(i, i + self.binLength)])

        subnum_reads_per_bin = []
        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.calculate_mean_isize(bam, chrom, trans)
                subnum_reads_per_bin.extend(tcov)

        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        return subnum_reads_per_bin, ''

    def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args):
        coverage, _ = self.count_reads_in_region(chrom, start, end)

        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        previous_value = None
        line_string = "{}\t{}\t{}\t{:g}\n"
        for tileIndex in range(coverage.shape[0]):
            tileCoverage = coverage[tileIndex, :]
            value = func_to_call(tileCoverage, func_args)
            writeStart = start + tileIndex * self.stepSize
            writeEnd = min(writeStart + self.stepSize, end)
            _file.write(line_string.format(chrom, writeStart, writeEnd, value))

        tempfilename = _file.name
        _file.close()
        return tempfilename

    def run(self, func_to_call, func_args, out_file_name, format="bedgraph"):
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            sort_cmd = cfg.config.get('external_tools', 'sort')
            os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name))
            os.remove(bedgraph_file)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            writeBedGraph.bedGraphToBigWig(
                chrom_names_and_size, bedgraph_file, out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)


wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19")
func_args = {'scaleFactor': 1.0}
wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat)
	#!/usr/bin/env python
	import deeptools.utilities
	from deeptools import bamHandler
	from deeptools import mapReduce
	from deeptools import writeBedGraph
	from deeptools import utilities
	from deeptools.utilities import getCommonChrNames, toBytes

	import sys
	import shutil
	import os
	import numpy as np

	BAM = "something.bam" # This is a BAM file name
	binLength = 1000 # Window size
	stepSize = 50 # The step size by which each window is shifted
	numberOfProcessors = 20 # Number of threads
	outFileName = "something.bigWig" # Ouput file name
	outFileFormat = "bigwig" # or "bedgraph"

	def writeBedGraph_wrapper(args):
	"""
	Passes the arguments to writeBedGraph_worker.
	This is a step required given
	the constrains from the multiprocessing module.
	The args var, contains as first element the 'self' value
	from the WriteBedGraph object

	"""
	return foo.writeBedGraph_worker(*args)

	class foo(writeBedGraph.WriteBedGraph):
	def calculate_mean_isize(self, bamHandle, chrom, regions):
	rv = np.zeros(len(regions), dtype='float64')
	for idx, reg in enumerate(regions):
	vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0]
	if len(vals) > 0:
	rv[idx] = np.mean(np.abs(vals))
	return rv

	def count_reads_in_region(self, chrom, start, end):
	bam_handlers = []
	for fname in self.bamFilesList:
	bam_handlers.append(bamHandler.openBam(fname))
	# A list of lists of tuples
	transcriptsToConsider = []
	for i in range(start, end, self.stepSize):
	if i + self.binLength > end:
	break
	transcriptsToConsider.append([(i, i + self.binLength)])

	subnum_reads_per_bin = []
	for bam in bam_handlers:
	for trans in transcriptsToConsider:
	tcov = self.calculate_mean_isize(bam, chrom, trans)
	subnum_reads_per_bin.extend(tcov)

	subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

	return subnum_reads_per_bin, ''

	def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args):
	coverage, _ = self.count_reads_in_region(chrom, start, end)

	_file = open(utilities.getTempFileName(suffix='.bg'), 'w')
	previous_value = None
	line_string = "{}\t{}\t{}\t{:g}\n"
	for tileIndex in range(coverage.shape[0]):
	tileCoverage = coverage[tileIndex, :]
	value = func_to_call(tileCoverage, func_args)
	writeStart = start + tileIndex * self.stepSize
	writeEnd = min(writeStart + self.stepSize, end)
	_file.write(line_string.format(chrom, writeStart, writeEnd, value))

	tempfilename = _file.name
	_file.close()
	return tempfilename

	def run(self, func_to_call, func_args, out_file_name, format="bedgraph"):
	bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
	genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength)
	# check if both bam files correspond to the same species
	# by comparing the chromosome names:
	chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

	if self.region:
	# in case a region is used, append the tilesize
	self.region += ":{}".format(self.binLength)

	for x in list(self.__dict__.keys()):
	sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

	res = mapReduce.mapReduce([func_to_call, func_args],
	writeBedGraph_wrapper,
	chrom_names_and_size,
	self_=self,
	genomeChunkLength=genome_chunk_length,
	region=self.region,
	numberOfProcessors=self.numberOfProcessors)

	# concatenate intermediary bedgraph files
	out_file = open(out_file_name + ".bg", 'wb')
	for tempfilename in res:
	if tempfilename:
	# concatenate all intermediate tempfiles into one
	# bedgraph file
	_foo = open(tempfilename, 'rb')
	shutil.copyfileobj(_foo, out_file)
	_foo.close()
	os.remove(tempfilename)

	bedgraph_file = out_file.name
	out_file.close()
	if format == 'bedgraph':
	sort_cmd = cfg.config.get('external_tools', 'sort')
	os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name))
	os.remove(bedgraph_file)
	if self.verbose:
	print("output file: {}".format(out_file_name))
	else:
	writeBedGraph.bedGraphToBigWig(
	chrom_names_and_size, bedgraph_file, out_file_name, True)
	if self.verbose:
	print("output file: {}".format(out_file_name))
	os.remove(bedgraph_file)


	wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19")
	func_args = {'scaleFactor': 1.0}
	wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat)