Last active
April 24, 2017 11:35
-
-
Save dpryan79/fbbdd4a6f897ccecec69239782256876 to your computer and use it in GitHub Desktop.
Convert a BAM file to a bigWig or bedGraph file containing the mean insert size (uses the deepTools API)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import deeptools.utilities | |
from deeptools import bamHandler | |
from deeptools import mapReduce | |
from deeptools import writeBedGraph | |
from deeptools import utilities | |
from deeptools.utilities import getCommonChrNames, toBytes | |
import sys | |
import shutil | |
import os | |
import numpy as np | |
BAM = "something.bam" # This is a BAM file name | |
binLength = 1000 # Window size | |
stepSize = 50 # The step size by which each window is shifted | |
numberOfProcessors = 20 # Number of threads | |
outFileName = "something.bigWig" # Ouput file name | |
outFileFormat = "bigwig" # or "bedgraph" | |
def writeBedGraph_wrapper(args): | |
""" | |
Passes the arguments to writeBedGraph_worker. | |
This is a step required given | |
the constrains from the multiprocessing module. | |
The args var, contains as first element the 'self' value | |
from the WriteBedGraph object | |
""" | |
return foo.writeBedGraph_worker(*args) | |
class foo(writeBedGraph.WriteBedGraph): | |
def calculate_mean_isize(self, bamHandle, chrom, regions): | |
rv = np.zeros(len(regions), dtype='float64') | |
for idx, reg in enumerate(regions): | |
vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0] | |
if len(vals) > 0: | |
rv[idx] = np.mean(np.abs(vals)) | |
return rv | |
def count_reads_in_region(self, chrom, start, end): | |
bam_handlers = [] | |
for fname in self.bamFilesList: | |
bam_handlers.append(bamHandler.openBam(fname)) | |
# A list of lists of tuples | |
transcriptsToConsider = [] | |
for i in range(start, end, self.stepSize): | |
if i + self.binLength > end: | |
break | |
transcriptsToConsider.append([(i, i + self.binLength)]) | |
subnum_reads_per_bin = [] | |
for bam in bam_handlers: | |
for trans in transcriptsToConsider: | |
tcov = self.calculate_mean_isize(bam, chrom, trans) | |
subnum_reads_per_bin.extend(tcov) | |
subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F') | |
return subnum_reads_per_bin, '' | |
def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args): | |
coverage, _ = self.count_reads_in_region(chrom, start, end) | |
_file = open(utilities.getTempFileName(suffix='.bg'), 'w') | |
previous_value = None | |
line_string = "{}\t{}\t{}\t{:g}\n" | |
for tileIndex in range(coverage.shape[0]): | |
tileCoverage = coverage[tileIndex, :] | |
value = func_to_call(tileCoverage, func_args) | |
writeStart = start + tileIndex * self.stepSize | |
writeEnd = min(writeStart + self.stepSize, end) | |
_file.write(line_string.format(chrom, writeStart, writeEnd, value)) | |
tempfilename = _file.name | |
_file.close() | |
return tempfilename | |
def run(self, func_to_call, func_args, out_file_name, format="bedgraph"): | |
bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList] | |
genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength) | |
# check if both bam files correspond to the same species | |
# by comparing the chromosome names: | |
chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False) | |
if self.region: | |
# in case a region is used, append the tilesize | |
self.region += ":{}".format(self.binLength) | |
for x in list(self.__dict__.keys()): | |
sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x))) | |
res = mapReduce.mapReduce([func_to_call, func_args], | |
writeBedGraph_wrapper, | |
chrom_names_and_size, | |
self_=self, | |
genomeChunkLength=genome_chunk_length, | |
region=self.region, | |
numberOfProcessors=self.numberOfProcessors) | |
# concatenate intermediary bedgraph files | |
out_file = open(out_file_name + ".bg", 'wb') | |
for tempfilename in res: | |
if tempfilename: | |
# concatenate all intermediate tempfiles into one | |
# bedgraph file | |
_foo = open(tempfilename, 'rb') | |
shutil.copyfileobj(_foo, out_file) | |
_foo.close() | |
os.remove(tempfilename) | |
bedgraph_file = out_file.name | |
out_file.close() | |
if format == 'bedgraph': | |
sort_cmd = cfg.config.get('external_tools', 'sort') | |
os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name)) | |
os.remove(bedgraph_file) | |
if self.verbose: | |
print("output file: {}".format(out_file_name)) | |
else: | |
writeBedGraph.bedGraphToBigWig( | |
chrom_names_and_size, bedgraph_file, out_file_name, True) | |
if self.verbose: | |
print("output file: {}".format(out_file_name)) | |
os.remove(bedgraph_file) | |
wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19") | |
func_args = {'scaleFactor': 1.0} | |
wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment