Skip to content

Instantly share code, notes, and snippets.

@arq5x
Created February 22, 2017 03:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arq5x/bfd53cd6925e729f554be93535b619ec to your computer and use it in GitHub Desktop.
Save arq5x/bfd53cd6925e729f554be93535b619ec to your computer and use it in GitHub Desktop.
base2
#!/usr/bin/env python
"""
given a callable file from goleft depth on stdin, merge adjacent LOW_COVERAGE and CALLABLE regions.
also split regions that are larger than max_region.
this is to even out parallelism for sending regions to freebayes.
"""
from __future__ import print_function, division
import sys
from itertools import groupby
from operator import itemgetter
# maximum region size to send to freebayes.
max_region = 2000000
def print_region(lines, excessive=False):
if len(lines) == 0: return
chrom = lines[0][0]
assert chrom == lines[-1][0]
s, e = int(lines[0][1]), int(lines[-1][2])
assert s < e, ("please sort input bed file", lines)
# for testing: cat 15-0022949.callable.sav.bed | ./pipeline/merge-callable.py - | bedtools subtract -a - -b \
# 15-0022949.callable.sav.bed | grep -v NO_C | grep -v EXCESS
#print("%s\t%s\t%s" % (chrom, lines[0][1], lines[-1][2]))
if excessive:
# if excessive coverage, make into a smaller region to reduce memory-use.
assert len(lines) == 1
chrom, start, end = lines[0][:3]
even_split(chrom, start, end, max_region/6.0)
elif e - s > 0.8 * max_region:
even_split(chrom, s, e, max_region/2.0)
else:
print("%s:%s-%s" % (chrom, s, e))
def even_split(chrom, start, end, max_region=max_region):
"""
split large regions evenly. this helps to avoid having the last
region tiny.
"""
# use 0.75 so we have regions that are under max_region
n = int((end - start) / (0.7 * max_region))
size = (end - start) // n
for s in range(start, end, size):
print_region([[chrom, s, min(end, s + size)]])
def xopen(path):
if path == "-": return sys.stdin
if path.endswith(".gz"): return gzip.open(path)
return open(path)
keep_excessive = len(sys.argv) > 2
for chrom, lines in groupby((x.rstrip("\r\n").split("\t") for x in xopen(sys.argv[1])), itemgetter(0)):
last = []
if chrom == "hs37d5": continue
for toks in lines:
if len(toks) == 3:
toks.append("CALLABLE")
if toks[3] == "EXCESSIVE_COVERAGE":
if last:
print_region(last)
last = last[:0]
if keep_excessive:
print_region([toks], True)
continue
# just include NO_COV so we dont send tiny regions to fb
if toks[3] == "NO_COVERAGE":
if len(last) == 0: continue
# if we have a reasonable size chunk already, then we just
# send that one off and skipt this no_coverage block.
if int(last[-1][2]) - int(last[-1][2]) > 0.2 * max_region:
print_region(last)
continue
# otherwise, we append the no coverage.
last.append(toks)
continue
end = int(toks[2])
last.append(toks)
# if we have a huge callable region, split into smaller pieces.
if end - int(last[0][1]) > max_region:
even_split(chrom, int(last[0][1]), end)
last = last[:0]
if last:
print_region(last)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment