@jasonsahl
@jasonsahl
Created November 3, 2021 20:16
calculates depth and breadth of coverage for BAM file across reference
#!/usr/bin/env python
"""From a BAM file, or read files, and reference genome
calculate the amount of genome covered
by a minimum coverage value"""
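# Example invocations, assuming this script is saved as cov_calc.py (the filename
# is illustrative; it is not given in the gist):
#   python cov_calc.py -r reference.fasta -d /path/to/fastq_dir -c 10 -p 4
#   python cov_calc.py -r reference.fasta -b /path/to/bam_dir -m cov_map.txt -f 0.25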
from optparse import OptionParser
import subprocess
import os
import glob
import sys
import re
import logging
import os.path
try:
    from Bio import SeqIO
except ImportError:
    print("Biopython needs to be installed..exiting")
    sys.exit()
def is_tool(name):
    """Return True if an executable can be found on the PATH."""
    # shutil.which is the standard replacement for the deprecated
    # distutils.spawn.find_executable (distutils is removed in Python 3.12)
    from shutil import which
    return which(name) is not None
def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("directory cannot be found")
        sys.exit()
def test_file(option, opt_str, value, parser):
    # mirror test_dir: store the value on success, otherwise exit;
    # the original printed the option object instead of the file path
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print('%s file cannot be opened' % value)
        sys.exit()
def mp_shell(func, params, numProc):
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.terminate()
    return out
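# A small usage sketch (hypothetical arguments) of how the workflows below call it:
#   mp_shell(_samtools_workflow, ["/data/a.bam", "/data/b.bam"], 4)
# maps the worker over the BAM list on a pool of 4 processes.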
def get_seq_length(ref):
    """Write each reference record ID and its length, tab-delimited, to tmp.txt."""
    outfile = open("tmp.txt", "w")
    with open(ref) as my_file:
        for record in SeqIO.parse(my_file, "fasta"):
            outfile.write(str(record.id)+"\t"+str(len(record.seq))+"\n")
    outfile.close()
def get_seq_name(bam_in):
    """used for renaming the sequences"""
    name = os.path.basename(bam_in)
    return name
def _samtools_workflow(data):
    f = data
    base = os.path.basename(f)
    name = base.replace(".bam","")
    subprocess.check_call("samtools index %s" % f,stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
    # -aa reports every reference position, including those with zero depth
    subprocess.check_call("samtools depth -aa %s > %s.bam.out" % (f,name),stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
def run_samtools(bam_path, processors):
    files = []
    for infile in glob.glob(os.path.join(bam_path,"*.bam")):
        files.append(infile)
    mp_shell(_samtools_workflow, files, processors)
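# Note: `samtools depth` writes three tab-separated columns per position:
#   reference_name    1-based_position    depth
# remove_column() below drops the middle column, leaving (reference, depth) pairs.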
def remove_column():
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.bam.out')):
        name = get_seq_name(infile)
        outfile = open("%s.coverage.txt" % name, "w")
        with open(infile) as my_file:
            for line in my_file:
                fields = line.split()
                # drop the position column; keep (reference, depth)
                del fields[1]
                outfile.write("\t".join(fields)+"\n")
        outfile.close()
def sum_coverage(cov):
    curr_dir = os.getcwd()
    positions = []
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        all = []
        cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                # split() already strips surrounding whitespace from each field
                all.append(line.split())
        for x,y in all:
            # a position counts only if its depth is strictly greater than the minimum
            if int(y) > int(cov):
                try:
                    cov_dict[x].append(y)
                except KeyError:
                    cov_dict[x] = [y]
        for k,v in cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
            positions.append(v)
        outfile.close()
    return positions
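# Each *.amount_covered.txt line pairs a reference sequence with the count of
# positions that passed the depth filter, e.g. (illustrative values only):
#   chromosome_1    4521003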
def sum_coverage_dict(cov_dict, percent):
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        redux_name = name.replace(".bam.out.coverage.txt","")
        indiv_cov = int(cov_dict.get(redux_name))
        # the minimum depth is a fraction (default 25 percent) of this sample's average DOC
        var_cov = round(indiv_cov*percent)
        all = []
        new_cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                all.append(line.split())
        for x,y in all:
            if int(y) > int(var_cov):
                try:
                    new_cov_dict[x].append(y)
                except KeyError:
                    new_cov_dict[x] = [y]
        for k,v in new_cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
        outfile.close()
def merge_files_by_column(column, file_1):
    """Merge two files on a shared column. Line ordering is assumed not to
    match between the files, so both are read into memory and joined by key."""
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir,'*amount_covered.txt')):
        join_map = {}
        with open(file_1) as my_file:
            for line in my_file:
                row = line.strip("\n").split()
                column_value = row.pop(column)
                join_map[column_value] = row
        with open(infile) as my_infile:
            for line in my_infile:
                row = line.strip("\n").split()
                column_value = row.pop(column)
                if column_value in join_map:
                    join_map[column_value].extend(row)
        name = get_seq_name(infile)
        fout = open("%s.results.txt" % name, "w")
        for k,v in join_map.items():
            fout.write('\t'.join([k] + v) + '\n')
        fout.close()
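# Worked example (illustrative values): a genome_size.txt line "chr1<TAB>1000"
# joined with an amount_covered line "chr1<TAB>950" produces the results.txt line:
#   chr1    1000    950
# i.e. reference name, reference length, positions passing the depth filter.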
def report_stats():
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.results.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.finals" % name, "w")
        redux = name.replace('.out.coverage.txt.amount_covered.txt.results.txt','')
        with open(infile) as my_file:
            for line in my_file:
                fields = line.split()
                chromosome = fields[0]
                try:
                    # breadth of coverage: covered positions / reference length, as a percentage
                    amount = (int(fields[2])/int(fields[1]))*100
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+str(amount)+"\n")
                except (IndexError, ValueError, ZeroDivisionError):
                    # no positions passed the filter for this reference
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+"0"+"\n")
        outfile.close()
def report_average_coverage(genome_size):
    curr_dir = os.getcwd()
    # collect the IDs of all reference sequences
    sequences = []
    outfile = open("all_covs", "w")
    with open(genome_size) as my_file:
        for line in my_file:
            fields = line.strip().split()
            sequences.append(fields[0])
    for infile in glob.glob(os.path.join(curr_dir, '*.coverage.txt')):
        name = get_seq_name(infile)
        redux = name.replace('.bam.out.coverage.txt','')
        cov_dict = {}
        value_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                fields = line.strip().split()
                try:
                    cov_dict[fields[0]].append(int(fields[1]))
                except KeyError:
                    cov_dict[fields[0]] = [int(fields[1])]
        for k,v in cov_dict.items():
            if k in sequences:
                # average depth of coverage across the reference sequence
                value_dict.update({k:sum(v)/len(v)})
            else:
                value_dict.update({k:"0"})
        for k,v in value_dict.items():
            outfile.write(str(redux)+"\t"+str(k)+"\t"+str(v)+"\n")
    outfile.close()
def average_chromosomes(results_file):
    all = []
    my_dict = {}
    outfile = open("all_results_averaged", "w")
    with open(results_file) as my_file:
        for line in my_file:
            fields = line.split()
            # drop the chromosome column; keep (sample, breadth)
            del fields[1]
            all.append(fields)
    for x,y in all:
        try:
            my_dict[x].append(y)
        except KeyError:
            my_dict[x] = [y]
    for k,v in my_dict.items():
        ints = list(map(float,v))
        outfile.write(k+"\t"+str(sum(ints)/len(ints))+"\n")
    outfile.close()
def generate_matrix(infile, my_out):
    cov_dict = {}
    id_list = []
    outfile = open(my_out, "w")
    with open(infile) as my_file:
        for line in my_file:
            fields = line.split()
            if fields[1] not in id_list:
                id_list.append(fields[1])
            try:
                cov_dict[fields[0]].append(fields[2])
            except KeyError:
                cov_dict[fields[0]] = [fields[2]]
    id_list.insert(0," ")
    outfile.write("\t".join(id_list)+"\n")
    for k,v in cov_dict.items():
        outfile.write(str(k)+"\t"+"\t".join(v)+"\n")
    outfile.close()
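# Sketch of the resulting matrix (illustrative names and values only): one
# column per reference sequence, one row per sample:
#           chr1    plasmid_1
#   S1      99.8    74.2
#   S2      98.1    0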
def make_cov_dict(cov_map):
    cov_dict = {}
    with open(cov_map) as my_file:
        for line in my_file:
            fields = line.strip().split()
            try:
                cov_dict.update({fields[0]:fields[1]})
            except IndexError:
                print("problem with map file, exiting...")
                sys.exit()
    return cov_dict
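# Inferred from the parsing above: the coverage map (-m) is a whitespace-delimited
# file with one sample per line (hypothetical values):
#   sample_1    55
#   sample_2    80
# where column 2 is that sample's average depth of coverage.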
def get_readFile_components(full_file_path):
    """function adapted from:
    https://github.com/katholt/srst2 - tested"""
    (file_path,file_name) = os.path.split(full_file_path)
    m1 = re.match("(.*).gz",file_name)
    ext = ""
    if m1 is not None:
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext,ext2) = os.path.splitext(file_name)
    full_ext = ext2+ext
    return file_path,file_name_before_ext,full_ext
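# Example: "/data/sample_1_R1.fastq.gz" is split into
#   ("/data", "sample_1_R1", ".fastq.gz")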
def read_file_sets(dir_path):
    """match up pairs of sequence data, adapted from
    https://github.com/katholt/srst2 will be tough to test
    with variable names and read paths"""
    fileSets = {}
    forward_reads = {}
    reverse_reads = {}
    num_paired_readsets = 0
    num_single_readsets = 0
    for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
        (file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
        # try default Illumina naming first, e.g. sample_S1_L001_R1_001; the
        # five-group pattern is restored from the srst2 source cited above
        m = re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
        if m is None:
            m = re.match("(.*)("+"_R1"+")(.*)$",file_name_before_ext)
            if m is not None:
                (baseName,read) = m.groups()[0], m.groups()[1]
                forward_reads[baseName] = infile
            else:
                m = re.match("(.*)("+"_R2"+")(.*)$",file_name_before_ext)
                if m is not None:
                    (baseName,read) = m.groups()[0], m.groups()[1]
                    reverse_reads[baseName] = infile
                else:
                    fileSets[file_name_before_ext] = infile
                    num_single_readsets += 1
        else:
            baseName, read = m.groups()[0], m.groups()[3]
            if read == "_R1":
                forward_reads[baseName] = infile
            elif read == "_R2":
                reverse_reads[baseName] = infile
            else:
                fileSets[file_name_before_ext] = infile
                num_single_readsets += 1
    for sample in forward_reads:
        if sample in reverse_reads:
            fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
            num_paired_readsets += 1
        else:
            fileSets[sample] = [forward_reads[sample]] # no reverse found
            num_single_readsets += 1
    for sample in reverse_reads:
        if sample not in fileSets:
            fileSets[sample] = reverse_reads[sample] # no forward found
            num_single_readsets += 1
    if num_paired_readsets > 0:
        logging.info('Total paired readsets found: ' + str(num_paired_readsets))
    elif num_single_readsets > 0:
        logging.info('Total single reads found: ' + str(num_single_readsets))
    return fileSets
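# Sketch of the returned structure for a directory holding sample_1_R1.fastq.gz
# and sample_1_R2.fastq.gz (hypothetical filenames):
#   {"sample_1": ["/reads/sample_1_R1.fastq.gz", "/reads/sample_1_R2.fastq.gz"]}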
def _make_bams_workflow(data):
    name = data[0]
    # read sets can arrive as a single path string or as a [R1, R2] list;
    # normalize so indexing below is safe either way
    reads = data[1]
    if isinstance(reads, str):
        reads = [reads]
    read_1 = reads[0]
    try:
        read_2 = reads[1]
    except IndexError:
        read_2 = "NULL"
    reference = data[2]
    aligner = data[3]
    if "NULL" not in read_2:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            subprocess.check_call("bowtie2 -x Ref -1 %s -2 %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    else:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            # bowtie2 takes unpaired reads with -U; -1 requires a matching -2
            subprocess.check_call("bowtie2 -x Ref -U %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
def make_bams(fileSets, ref, processors, aligner):
    files_and_temp_names = []
    for idx,f in fileSets.items():
        files_and_temp_names.append([idx,f,ref,aligner])
    mp_shell(_make_bams_workflow, files_and_temp_names, processors)
def main(aligner,bam_dir,ref,read_dir,cov,processors,cov_map,filter_percent):
    dependencies = ['minimap2','bowtie2','samtools']
    for dependency in dependencies:
        if is_tool(dependency) is False:
            print("%s is not installed..exiting" % dependency)
            sys.exit()
    start_dir = os.getcwd()
    start_path = os.path.abspath(start_dir)
    if bam_dir:
        bam_path = os.path.abspath(bam_dir)
    elif read_dir:
        read_path = os.path.abspath(read_dir)
    else:
        print("Either bam_path or read_path needed")
        sys.exit()
    if os.path.exists(ref) is False:
        print('reference file cannot be found')
        sys.exit()
    else:
        ref_path = os.path.abspath(ref)
    #This is a checkpoint, although the file is not checked for completeness
    if os.path.isfile("all_results"):
        pass
    else:
        if "NULL" not in cov_map:
            cov_dict = make_cov_dict(cov_map)
        get_seq_length(ref_path)
        subprocess.check_call('sed "s/ /\\t/g" tmp.txt > genome_size.txt',stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        #Get read-pair information
        if read_dir:
            fileSets = read_file_sets("%s" % read_path)
            if "minimap2" in aligner:
                try:
                    os.mkdir("minimap2_bams")
                except FileExistsError:
                    pass
                os.chdir("minimap2_bams")
            elif "bowtie2" in aligner:
                try:
                    os.mkdir("bowtie2_bams")
                except FileExistsError:
                    pass
                os.chdir("bowtie2_bams")
                subprocess.check_call("bowtie2-build %s Ref" % ref_path,stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
            make_bams(fileSets,ref_path,processors,aligner)
            os.chdir(start_path)
            # index BAMs from whichever aligner was used, not only minimap2
            run_samtools("%s_bams" % aligner,processors)
            remove_column()
        elif bam_dir:
            run_samtools(bam_path,processors)
            remove_column()
        if "NULL" in cov_map:
            positions = sum_coverage(cov)
            #If the amount_covered.txt files are empty, the merge would throw an error
            if len(positions) > 0:
                merge_files_by_column(0,"genome_size.txt")
            else:
                print("no positions pass minimum coverage...exiting")
                sys.exit()
        else:
            sum_coverage_dict(cov_dict,filter_percent)
            merge_files_by_column(0,"genome_size.txt")
        report_stats()
        subprocess.check_call("cat *finals > all_results", stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    average_chromosomes("all_results")
    #The following function reports average depth of coverage; breadth is in all_results
    report_average_coverage("genome_size.txt")
    generate_matrix("all_results","breadth_matrix.txt")
    generate_matrix("all_covs","coverage_matrix.txt")
    #Clean up intermediate files
    try:
        subprocess.check_call("rm *.bam.out *finals *amount_covered.txt *results.txt all_results_averaged all_results all_covs *.coverage.txt tmp.txt genome_size.txt",
                              stderr=open(os.devnull, 'wb'),shell=True)
    except subprocess.CalledProcessError:
        pass
if __name__ == "__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--aligner", dest="aligner",
                      help="which aligner to use? minimap2 [default] or bowtie2?",
                      action="store", type="string", default="minimap2")
    parser.add_option("-b", "--input_bam", dest="bam_dir",
                      help="full path to BAM directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-r", "--reference", dest="ref",
                      help="full path to reference FASTA [REQUIRED]",
                      action="store", type="string")
    parser.add_option("-d", "--read_dir", dest="read_dir",
                      help="full path to reads directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-c", "--coverage", dest="cov",
                      help="minimum coverage across the genome, defaults to 10x; only applies if no map is provided",
                      default=10, type="int")
    parser.add_option("-p", "--processors", dest="processors",
                      help="number of processors to apply to the job, defaults to 2",
                      default=2, type="int")
    parser.add_option("-m", "--cov_map", dest="cov_map",
                      help="coverage map file for each genome",
                      default="NULL", type="string")
    parser.add_option("-f", "--filter_percent", dest="filter_percent",
                      help="filter breadth below this fraction of the mapped average coverage, defaults to 0.25",
                      default=0.25, type="float")
    options, args = parser.parse_args()
    mandatories = ["ref"]
    for m in mandatories:
        if not options.__dict__[m]:
            print("\nMust provide %s.\n" % m)
            parser.print_help()
            exit(-1)
    main(options.aligner,options.bam_dir,options.ref,options.read_dir,options.cov,options.processors,
         options.cov_map,options.filter_percent)