Calculates depth and breadth of coverage for BAM files across a reference genome.
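For reference, a typical invocation might look like the following (the script name and paths are illustrative, not part of the gist):

    python bam_coverage.py -r reference.fasta -d /path/to/reads -a minimap2 -c 10 -p 4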
#!/usr/bin/env python

"""From a BAM file, or read files, and a reference genome,
calculate the amount of the genome covered
by a minimum coverage value"""

from optparse import OptionParser
import subprocess
import os
import glob
import sys
import re
import logging
import os.path

try:
    from Bio import SeqIO
except ImportError:
    print("Biopython needs to be installed..exiting")
    sys.exit()
def is_tool(name):
    """Return True if the named executable is on the PATH."""
    # shutil.which replaces distutils.spawn.find_executable,
    # which was removed along with distutils in Python 3.12
    from shutil import which
    return which(name) is not None
def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("directory cannot be found")
        sys.exit()

def test_file(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print('%s file cannot be opened' % value)
        sys.exit()
def mp_shell(func, params, numProc):
    """Map func over params with a pool of numProc worker processes."""
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.close()
    p.join()
    return out
def get_seq_length(ref):
    """Write the ID and length of each reference sequence to tmp.txt."""
    outfile = open("tmp.txt", "w")
    with open(ref) as my_file:
        for record in SeqIO.parse(my_file, "fasta"):
            outfile.write(str(record.id)+"\t"+str(len(record.seq))+"\n")
    outfile.close()
def get_seq_name(bam_in):
    """used for renaming the sequences"""
    name = os.path.basename(bam_in)
    return name
def _samtools_workflow(data):
    f = data
    base = os.path.basename(f)
    name = base.replace(".bam","")
    subprocess.check_call("samtools index %s" % f,stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
    # "samtools depth -aa" reports depth at every position, including zero-coverage sites
    subprocess.check_call("samtools depth -aa %s > %s.bam.out" % (f,name),stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
def run_samtools(bam_path,processors):
    files = []
    for infile in glob.glob(os.path.join(bam_path,"*.bam")):
        files.append(infile)
    mp_shell(_samtools_workflow,files,processors)
def remove_column():
    """Drop the position column from the samtools depth output,
    leaving chromosome and depth."""
    curr_dir=os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.bam.out')):
        name=get_seq_name(infile)
        outfile = open("%s.coverage.txt" % name, "w")
        with open(infile) as my_file:
            for line in my_file:
                fields=line.split()
                del fields[1]
                outfile.write("\t".join(fields)+"\n")
        outfile.close()
def sum_coverage(cov):
    """Count, per chromosome, the number of positions whose depth
    exceeds the fixed coverage threshold."""
    curr_dir=os.getcwd()
    positions = []
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        all_fields = []
        cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                # split() already strips surrounding whitespace
                all_fields.append(line.split())
        for x,y in all_fields:
            # strictly greater than the threshold
            if int(y)>int(cov):
                try:
                    cov_dict[x].append(y)
                except KeyError:
                    cov_dict[x] = [y]
            else:
                continue
        for k,v in cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
            positions.append(v)
        outfile.close()
    return positions
def sum_coverage_dict(cov_dict,percent):
    """Count covered positions using a per-genome threshold: a fraction
    (percent) of that genome's average depth of coverage from the map file.
    Assumes every sample appears in the map."""
    curr_dir=os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name=get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        redux_name = name.replace(".bam.out.coverage.txt","")
        indiv_cov = int(cov_dict.get(redux_name))
        # the minimum is the given fraction (default 25 percent) of the average DOC
        var_cov = round(indiv_cov*percent)
        all_fields = []
        new_cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                all_fields.append(line.split())
        for x,y in all_fields:
            if int(y)>int(var_cov):
                try:
                    new_cov_dict[x].append(y)
                except KeyError:
                    new_cov_dict[x] = [y]
            else:
                continue
        for k,v in new_cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
        outfile.close()
def merge_files_by_column(column,file_1):
    """Takes two files and merges them on the given column. It is assumed
    that line ordering in the files does not match, so we read both files
    into memory and join them."""
    curr_dir=os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir,'*amount_covered.txt')):
        join_map = {}
        with open(file_1) as my_file:
            for line in my_file:
                newline = line.strip("\n")
                row = newline.split()
                column_value = row.pop(column)
                join_map[column_value] = row
        with open(infile) as my_infile:
            for line in my_infile:
                newline = line.strip("\n")
                row = newline.split()
                column_value = row.pop(column)
                if column_value in join_map:
                    join_map[column_value].extend(row)
        name=get_seq_name(infile)
        fout = open("%s.results.txt" % name, "w")
        for k,v in join_map.items():
            fout.write('\t'.join([k] + v) + '\n')
        fout.close()
def report_stats():
    curr_dir=os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.results.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.finals" % name, "w")
        redux = name.replace('.out.coverage.txt.amount_covered.txt.results.txt','')
        with open(infile) as my_file:
            for line in my_file:
                fields = line.split()
                chromosome = fields[0]
                try:
                    # percent of the chromosome covered above the threshold
                    amount = (int(fields[2])/int(fields[1]))*100
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+str(amount)+"\n")
                except (IndexError, ZeroDivisionError, ValueError):
                    # no positions passed the threshold for this chromosome
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+"0"+"\n")
        outfile.close()
def report_average_coverage(genome_size):
    curr_dir=os.getcwd()
    # these will be all of your reference sequences
    sequences=[]
    outfile = open("all_covs", "w")
    with open(genome_size) as my_file:
        for line in my_file:
            newline = line.strip()
            fields = newline.split()
            sequences.append(fields[0])
    for infile in glob.glob(os.path.join(curr_dir, '*.coverage.txt')):
        name = get_seq_name(infile)
        redux = name.replace('.bam.out.coverage.txt','')
        cov_dict = {}
        value_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                newline = line.strip()
                fields = newline.split()
                try:
                    cov_dict[fields[0]].append(int(fields[1]))
                except KeyError:
                    cov_dict[fields[0]] = [int(fields[1])]
        for k,v in cov_dict.items():
            if k in sequences:
                value_dict.update({k:sum(v)/len(v)})
            else:
                value_dict.update({k:"0"})
        for k,v in value_dict.items():
            outfile.write(str(redux)+"\t"+str(k)+"\t"+str(v)+"\n")
    outfile.close()
def average_chromosomes(results_file):
    all_fields=[]
    my_dict = {}
    outfile = open("all_results_averaged", "w")
    with open(results_file) as my_file:
        for line in my_file:
            fields = line.split()
            del fields[1]
            all_fields.append(fields)
    for x,y in all_fields:
        try:
            my_dict[x].append(y)
        except KeyError:
            my_dict[x] = [y]
    for k,v in my_dict.items():
        ints=list(map(float,v))
        outfile.write(k+"\t"+str(sum(ints)/len(ints))+"\n")
    outfile.close()
def generate_matrix(infile,my_out):
    cov_dict = {}
    id_list = []
    outfile = open(my_out, "w")
    with open(infile) as my_file:
        for line in my_file:
            fields = line.split()
            if fields[1] not in id_list:
                id_list.append(fields[1])
            try:
                cov_dict[fields[0]].append(fields[2])
            except KeyError:
                cov_dict[fields[0]] = [fields[2]]
    id_list.insert(0," ")
    outfile.write("\t".join(id_list)+"\n")
    for k,v in cov_dict.items():
        outfile.write(str(k)+"\t"+"\t".join(v)+"\n")
    outfile.close()
def make_cov_dict(cov_map):
    """Parse a two-column map of genome name to average depth of coverage."""
    cov_dict = {}
    with open(cov_map) as my_file:
        for line in my_file:
            newline = line.strip()
            fields = newline.split()
            try:
                cov_dict.update({fields[0]:fields[1]})
            except IndexError:
                print("problem with map file, exiting...")
                sys.exit()
    return cov_dict
def get_readFile_components(full_file_path):
    """function adapted from:
    https://github.com/katholt/srst2 - tested"""
    (file_path,file_name) = os.path.split(full_file_path)
    m1 = re.match("(.*).gz",file_name)
    ext = ""
    if m1 is not None:
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext,ext2) = os.path.splitext(file_name)
    full_ext = ext2+ext
    return file_path,file_name_before_ext,full_ext
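# A worked example of the helper above (the path is illustrative):
#   get_readFile_components("/data/sample_R1.fastq.gz")
#   -> ("/data", "sample_R1", ".fastq.gz")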
def read_file_sets(dir_path):
    """match up pairs of sequence data, adapted from
    https://github.com/katholt/srst2 will be tough to test
    with variable names and read paths"""
    fileSets = {}
    forward_reads = {}
    reverse_reads = {}
    num_paired_readsets = 0
    num_single_readsets = 0
    for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
        (file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
        # try Illumina-style names first (e.g. sample_S1_L001_R1_001), as in srst2
        m=re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
        if m is None:
            m=re.match("(.*)("+"_R1"+")(.*)$",file_name_before_ext)
            if m is not None:
                (baseName,read) = m.groups()[0], m.groups()[1]
                forward_reads[baseName] = infile
            else:
                m=re.match("(.*)("+"_R2"+")(.*)$",file_name_before_ext)
                if m is not None:
                    (baseName,read) = m.groups()[0], m.groups()[1]
                    reverse_reads[baseName] = infile
                else:
                    fileSets[file_name_before_ext] = infile
                    num_single_readsets += 1
        else:
            baseName, read = m.groups()[0], m.groups()[3]
            if read == "_R1":
                forward_reads[baseName] = infile
            elif read == "_R2":
                reverse_reads[baseName] = infile
            else:
                fileSets[file_name_before_ext] = infile
                num_single_readsets += 1
    for sample in forward_reads:
        if sample in reverse_reads:
            fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
            num_paired_readsets += 1
        else:
            fileSets[sample] = [forward_reads[sample]] # no reverse found
            num_single_readsets += 1
    for sample in reverse_reads:
        if sample not in fileSets:
            fileSets[sample] = [reverse_reads[sample]] # no forward found
            num_single_readsets += 1
    if num_paired_readsets > 0:
        logging.info('Total paired readsets found:' + str(num_paired_readsets))
    if num_single_readsets > 0:
        logging.info('Total single reads found:' + str(num_single_readsets))
    return fileSets
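# Illustrative result: for sample_S1_L001_R1_001.fastq.gz and
# sample_S1_L001_R2_001.fastq.gz, fileSets maps "sample" to
# [path/to/R1, path/to/R2]. Files that match no pattern are stored as a
# bare path, which _make_bams_workflow below normalizes.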
def _make_bams_workflow(data):
    name = data[0]
    # single-end readsets may be stored as a bare path rather than a list
    if isinstance(data[1], str):
        read_1 = data[1]
        read_2 = "NULL"
    else:
        read_1 = data[1][0]
        try:
            read_2 = data[1][1]
        except IndexError:
            read_2 = "NULL"
    reference = data[2]
    aligner = data[3]
    if "NULL" not in read_2:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            # bowtie2 uses the "Ref" index built in main()
            subprocess.check_call("bowtie2 -x Ref -1 %s -2 %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    else:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            # unpaired reads are passed to bowtie2 with -U, not -1
            subprocess.check_call("bowtie2 -x Ref -U %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)

def make_bams(fileSets,ref,processors,aligner):
    files_and_temp_names = []
    for idx,f in fileSets.items():
        files_and_temp_names.append([idx,f,ref,aligner])
    mp_shell(_make_bams_workflow,files_and_temp_names,processors)
def main(aligner,bam_dir,ref,read_dir,cov,processors,cov_map,filter_percent):
    dependencies = ['minimap2','bowtie2','samtools']
    for dependency in dependencies:
        result = is_tool(dependency)
        if result is False:
            print("%s is not installed..exiting" % dependency)
            sys.exit()
    start_dir = os.getcwd()
    start_path = os.path.abspath(start_dir)
    if bam_dir:
        bam_path = os.path.abspath(bam_dir)
    elif read_dir:
        read_path = os.path.abspath(read_dir)
    else:
        print("Either bam_path or read_path needed")
        sys.exit()
    file_exists = os.path.exists(ref)
    if file_exists is False:
        print('reference file cannot be found')
        sys.exit()
    else:
        ref_path = os.path.abspath(ref)
    #This is a checkpoint, although I don't check whether the file is complete
    if os.path.isfile("all_results"):
        pass
    else:
        if "NULL" not in cov_map:
            cov_dict=make_cov_dict(cov_map)
        get_seq_length(ref_path)
        subprocess.check_call('sed "s/ /\\t/g" tmp.txt > genome_size.txt',stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        #Get readpair information
        if read_dir:
            fileSets=read_file_sets("%s" % read_path)
            bams_dir = "%s_bams" % aligner
            if "minimap2" in aligner:
                try:
                    os.mkdir(bams_dir)
                except OSError:
                    pass
                os.chdir(bams_dir)
            elif "bowtie2" in aligner:
                try:
                    os.mkdir(bams_dir)
                except OSError:
                    pass
                os.chdir(bams_dir)
                subprocess.check_call("bowtie2-build %s Ref" % ref_path ,stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
            make_bams(fileSets,ref_path,processors,aligner)
            os.chdir(start_path)
            # the BAMs live in the aligner-specific directory created above
            run_samtools(bams_dir,processors)
            remove_column()
        elif bam_dir:
            run_samtools(bam_path,processors)
            remove_column()
        if "NULL" in cov_map:
            positions = sum_coverage(cov)
            if len(positions)==0:
                print("no positions pass minimum coverage...exiting")
                sys.exit()
        else:
            sum_coverage_dict(cov_dict,filter_percent)
        #If the amount_covered.txt files are empty, this next command will throw an error
        merge_files_by_column(0,"genome_size.txt")
        report_stats()
        subprocess.check_call("cat *finals > all_results", stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    average_chromosomes("all_results")
    #The following function is for breadth of coverage
    report_average_coverage("genome_size.txt")
    generate_matrix("all_results","breadth_matrix.txt")
    generate_matrix("all_covs","coverage_matrix.txt")
    try:
        subprocess.check_call("rm *.bam.out *finals *amount_covered.txt *results.txt all_results_averaged all_results all_covs *.coverage.txt tmp.txt genome_size.txt",
                              stderr=open(os.devnull, 'wb'),shell=True)
    except subprocess.CalledProcessError:
        pass
if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--aligner", dest="aligner",
                      help="which aligner to use? minimap2 [default] or bowtie2?",
                      action="store", type="string", default="minimap2")
    parser.add_option("-b", "--input_bam", dest="bam_dir",
                      help="full path to bam directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-r", "--reference", dest="ref",
                      help="full path to reference fasta [REQUIRED]",
                      action="store", type="string")
    parser.add_option("-d", "--read_dir", dest="read_dir",
                      help="full path to reads directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-c", "--coverage", dest="cov",
                      help="minimum coverage across the genome, defaults to 10x, only applies if no map provided",
                      default=10, type="int")
    parser.add_option("-p", "--processors", dest="processors",
                      help="number of processors to apply to the job, defaults to 2",
                      default=2, type="int")
    parser.add_option("-m", "--cov_map", dest="cov_map",
                      help="coverage map file for each genome",
                      default="NULL", type="string")
    parser.add_option("-f", "--filter_percent", dest="filter_percent",
                      help="filter breadth below a percentage of max coverage, based on map, defaults to 0.25",
                      default=0.25, type="float")
    options, args = parser.parse_args()
    mandatories = ["ref"]
    for m in mandatories:
        if not options.__dict__[m]:
            print("\nMust provide %s.\n" % m)
            parser.print_help()
            sys.exit(-1)
    main(options.aligner,options.bam_dir,options.ref,options.read_dir,options.cov,options.processors,
         options.cov_map,options.filter_percent)
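On completion the script removes its intermediate files and leaves two tab-delimited matrices: breadth_matrix.txt (percent of each reference sequence covered above the threshold, one row per sample) and coverage_matrix.txt (average depth of coverage per reference sequence, one row per sample).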