Skip to content

Instantly share code, notes, and snippets.

@dpryan79
Last active April 24, 2017 11:35
Show Gist options
  • Save dpryan79/fbbdd4a6f897ccecec69239782256876 to your computer and use it in GitHub Desktop.
Save dpryan79/fbbdd4a6f897ccecec69239782256876 to your computer and use it in GitHub Desktop.
Convert a BAM file to a bigWig or bedGraph file containing the mean insert size (uses the deepTools API)
#!/usr/bin/env python
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptools import writeBedGraph
from deeptools import utilities
from deeptools.utilities import getCommonChrNames, toBytes
import sys
import shutil
import os
import numpy as np
BAM = "something.bam" # This is a BAM file name
binLength = 1000 # Window size
stepSize = 50 # The step size by which each window is shifted
numberOfProcessors = 20 # Number of threads
outFileName = "something.bigWig" # Ouput file name
outFileFormat = "bigwig" # or "bedgraph"
def writeBedGraph_wrapper(args):
"""
Passes the arguments to writeBedGraph_worker.
This is a step required given
the constrains from the multiprocessing module.
The args var, contains as first element the 'self' value
from the WriteBedGraph object
"""
return foo.writeBedGraph_worker(*args)
class foo(writeBedGraph.WriteBedGraph):
def calculate_mean_isize(self, bamHandle, chrom, regions):
rv = np.zeros(len(regions), dtype='float64')
for idx, reg in enumerate(regions):
vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0]
if len(vals) > 0:
rv[idx] = np.mean(np.abs(vals))
return rv
def count_reads_in_region(self, chrom, start, end):
bam_handlers = []
for fname in self.bamFilesList:
bam_handlers.append(bamHandler.openBam(fname))
# A list of lists of tuples
transcriptsToConsider = []
for i in range(start, end, self.stepSize):
if i + self.binLength > end:
break
transcriptsToConsider.append([(i, i + self.binLength)])
subnum_reads_per_bin = []
for bam in bam_handlers:
for trans in transcriptsToConsider:
tcov = self.calculate_mean_isize(bam, chrom, trans)
subnum_reads_per_bin.extend(tcov)
subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')
return subnum_reads_per_bin, ''
def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args):
coverage, _ = self.count_reads_in_region(chrom, start, end)
_file = open(utilities.getTempFileName(suffix='.bg'), 'w')
previous_value = None
line_string = "{}\t{}\t{}\t{:g}\n"
for tileIndex in range(coverage.shape[0]):
tileCoverage = coverage[tileIndex, :]
value = func_to_call(tileCoverage, func_args)
writeStart = start + tileIndex * self.stepSize
writeEnd = min(writeStart + self.stepSize, end)
_file.write(line_string.format(chrom, writeStart, writeEnd, value))
tempfilename = _file.name
_file.close()
return tempfilename
def run(self, func_to_call, func_args, out_file_name, format="bedgraph"):
bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength)
# check if both bam files correspond to the same species
# by comparing the chromosome names:
chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)
if self.region:
# in case a region is used, append the tilesize
self.region += ":{}".format(self.binLength)
for x in list(self.__dict__.keys()):
sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))
res = mapReduce.mapReduce([func_to_call, func_args],
writeBedGraph_wrapper,
chrom_names_and_size,
self_=self,
genomeChunkLength=genome_chunk_length,
region=self.region,
numberOfProcessors=self.numberOfProcessors)
# concatenate intermediary bedgraph files
out_file = open(out_file_name + ".bg", 'wb')
for tempfilename in res:
if tempfilename:
# concatenate all intermediate tempfiles into one
# bedgraph file
_foo = open(tempfilename, 'rb')
shutil.copyfileobj(_foo, out_file)
_foo.close()
os.remove(tempfilename)
bedgraph_file = out_file.name
out_file.close()
if format == 'bedgraph':
sort_cmd = cfg.config.get('external_tools', 'sort')
os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name))
os.remove(bedgraph_file)
if self.verbose:
print("output file: {}".format(out_file_name))
else:
writeBedGraph.bedGraphToBigWig(
chrom_names_and_size, bedgraph_file, out_file_name, True)
if self.verbose:
print("output file: {}".format(out_file_name))
os.remove(bedgraph_file)
wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19")
func_args = {'scaleFactor': 1.0}
wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment