@jasonsahl
@jasonsahl
Created November 3, 2021 20:16
calculates depth and breadth of coverage for BAM file across reference
#!/usr/bin/env python
"""From a BAM file, or read files, and reference genome
calculate the amount of genome covered
by a minimum coverage value"""
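# Example invocations, assuming this script is saved as cov_calc.py (the filename
# is illustrative; it is not given in the gist):
#   python cov_calc.py -r reference.fasta -d /path/to/fastq_dir -c 10 -p 4
#   python cov_calc.py -r reference.fasta -b /path/to/bam_dir -m cov_map.txt -f 0.25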
from optparse import OptionParser
import subprocess
import os
import glob
import sys
import re
import logging
import os.path
try:
    from Bio import SeqIO
except ImportError:
    print("Biopython needs to be installed..exiting")
    sys.exit()
def is_tool(name):
    """Return True if an executable can be found on the PATH."""
    # shutil.which is the standard replacement for the deprecated
    # distutils.spawn.find_executable (distutils is removed in Python 3.12)
    from shutil import which
    return which(name) is not None
def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("directory cannot be found")
        sys.exit()
def test_file(option, opt_str, value, parser):
    # mirror test_dir: store the value on success, otherwise exit;
    # the original printed the option object instead of the file path
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print('%s file cannot be opened' % value)
        sys.exit()
def mp_shell(func, params, numProc):
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.terminate()
    return out
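# A small usage sketch (hypothetical arguments) of how the workflows below call it:
#   mp_shell(_samtools_workflow, ["/data/a.bam", "/data/b.bam"], 4)
# maps the worker over the BAM list on a pool of 4 processes.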
def get_seq_length(ref):
    """Write each reference record ID and its length, tab-delimited, to tmp.txt."""
    outfile = open("tmp.txt", "w")
    with open(ref) as my_file:
        for record in SeqIO.parse(my_file, "fasta"):
            outfile.write(str(record.id)+"\t"+str(len(record.seq))+"\n")
    outfile.close()
def get_seq_name(bam_in):
    """used for renaming the sequences"""
    name = os.path.basename(bam_in)
    return name
def _samtools_workflow(data):
    f = data
    base = os.path.basename(f)
    name = base.replace(".bam","")
    subprocess.check_call("samtools index %s" % f,stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
    # -aa reports every reference position, including those with zero depth
    subprocess.check_call("samtools depth -aa %s > %s.bam.out" % (f,name),stdout=open(os.devnull,'wb'),stderr=open(os.devnull,'wb'),shell=True)
def run_samtools(bam_path, processors):
    files = []
    for infile in glob.glob(os.path.join(bam_path,"*.bam")):
        files.append(infile)
    mp_shell(_samtools_workflow, files, processors)
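# Note: `samtools depth` writes three tab-separated columns per position:
#   reference_name    1-based_position    depth
# remove_column() below drops the middle column, leaving (reference, depth) pairs.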
def remove_column():
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.bam.out')):
        name = get_seq_name(infile)
        outfile = open("%s.coverage.txt" % name, "w")
        with open(infile) as my_file:
            for line in my_file:
                fields = line.split()
                # drop the position column; keep (reference, depth)
                del fields[1]
                outfile.write("\t".join(fields)+"\n")
        outfile.close()
def sum_coverage(cov):
    curr_dir = os.getcwd()
    positions = []
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        all = []
        cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                # split() already strips surrounding whitespace from each field
                all.append(line.split())
        for x,y in all:
            # a position counts only if its depth is strictly greater than the minimum
            if int(y) > int(cov):
                try:
                    cov_dict[x].append(y)
                except KeyError:
                    cov_dict[x] = [y]
        for k,v in cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
            positions.append(v)
        outfile.close()
    return positions
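# Each *.amount_covered.txt line pairs a reference sequence with the count of
# positions that passed the depth filter, e.g. (illustrative values only):
#   chromosome_1    4521003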
def sum_coverage_dict(cov_dict, percent):
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*coverage.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.amount_covered.txt" % name, "w")
        redux_name = name.replace(".bam.out.coverage.txt","")
        indiv_cov = int(cov_dict.get(redux_name))
        # the minimum depth is a fraction (default 25 percent) of this sample's average DOC
        var_cov = round(indiv_cov*percent)
        all = []
        new_cov_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                all.append(line.split())
        for x,y in all:
            if int(y) > int(var_cov):
                try:
                    new_cov_dict[x].append(y)
                except KeyError:
                    new_cov_dict[x] = [y]
        for k,v in new_cov_dict.items():
            outfile.write(k+"\t"+str(len(v))+"\n")
        outfile.close()
def merge_files_by_column(column, file_1):
    """Merge two files on a shared column. Line ordering is assumed not to
    match between the files, so both are read into memory and joined by key."""
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir,'*amount_covered.txt')):
        join_map = {}
        with open(file_1) as my_file:
            for line in my_file:
                row = line.strip("\n").split()
                column_value = row.pop(column)
                join_map[column_value] = row
        with open(infile) as my_infile:
            for line in my_infile:
                row = line.strip("\n").split()
                column_value = row.pop(column)
                if column_value in join_map:
                    join_map[column_value].extend(row)
        name = get_seq_name(infile)
        fout = open("%s.results.txt" % name, "w")
        for k,v in join_map.items():
            fout.write('\t'.join([k] + v) + '\n')
        fout.close()
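# Worked example (illustrative values): a genome_size.txt line "chr1<TAB>1000"
# joined with an amount_covered line "chr1<TAB>950" produces the results.txt line:
#   chr1    1000    950
# i.e. reference name, reference length, positions passing the depth filter.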
def report_stats():
    curr_dir = os.getcwd()
    for infile in glob.glob(os.path.join(curr_dir, '*.results.txt')):
        name = get_seq_name(infile)
        outfile = open("%s.finals" % name, "w")
        redux = name.replace('.out.coverage.txt.amount_covered.txt.results.txt','')
        with open(infile) as my_file:
            for line in my_file:
                fields = line.split()
                chromosome = fields[0]
                try:
                    # breadth of coverage: covered positions / reference length, as a percentage
                    amount = (int(fields[2])/int(fields[1]))*100
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+str(amount)+"\n")
                except (IndexError, ValueError, ZeroDivisionError):
                    # no positions passed the filter for this reference
                    outfile.write(str(redux)+"\t"+str(chromosome)+"\t"+"0"+"\n")
        outfile.close()
def report_average_coverage(genome_size):
    curr_dir = os.getcwd()
    # collect the IDs of all reference sequences
    sequences = []
    outfile = open("all_covs", "w")
    with open(genome_size) as my_file:
        for line in my_file:
            fields = line.strip().split()
            sequences.append(fields[0])
    for infile in glob.glob(os.path.join(curr_dir, '*.coverage.txt')):
        name = get_seq_name(infile)
        redux = name.replace('.bam.out.coverage.txt','')
        cov_dict = {}
        value_dict = {}
        with open(infile) as my_file:
            for line in my_file:
                fields = line.strip().split()
                try:
                    cov_dict[fields[0]].append(int(fields[1]))
                except KeyError:
                    cov_dict[fields[0]] = [int(fields[1])]
        for k,v in cov_dict.items():
            if k in sequences:
                # average depth of coverage across the reference sequence
                value_dict.update({k:sum(v)/len(v)})
            else:
                value_dict.update({k:"0"})
        for k,v in value_dict.items():
            outfile.write(str(redux)+"\t"+str(k)+"\t"+str(v)+"\n")
    outfile.close()
def average_chromosomes(results_file):
    all = []
    my_dict = {}
    outfile = open("all_results_averaged", "w")
    with open(results_file) as my_file:
        for line in my_file:
            fields = line.split()
            # drop the chromosome column; keep (sample, breadth)
            del fields[1]
            all.append(fields)
    for x,y in all:
        try:
            my_dict[x].append(y)
        except KeyError:
            my_dict[x] = [y]
    for k,v in my_dict.items():
        ints = list(map(float,v))
        outfile.write(k+"\t"+str(sum(ints)/len(ints))+"\n")
    outfile.close()
def generate_matrix(infile, my_out):
    cov_dict = {}
    id_list = []
    outfile = open(my_out, "w")
    with open(infile) as my_file:
        for line in my_file:
            fields = line.split()
            if fields[1] not in id_list:
                id_list.append(fields[1])
            try:
                cov_dict[fields[0]].append(fields[2])
            except KeyError:
                cov_dict[fields[0]] = [fields[2]]
    id_list.insert(0," ")
    outfile.write("\t".join(id_list)+"\n")
    for k,v in cov_dict.items():
        outfile.write(str(k)+"\t"+"\t".join(v)+"\n")
    outfile.close()
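# Sketch of the resulting matrix (illustrative names and values only): one
# column per reference sequence, one row per sample:
#           chr1    plasmid_1
#   S1      99.8    74.2
#   S2      98.1    0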
def make_cov_dict(cov_map):
    cov_dict = {}
    with open(cov_map) as my_file:
        for line in my_file:
            fields = line.strip().split()
            try:
                cov_dict.update({fields[0]:fields[1]})
            except IndexError:
                print("problem with map file, exiting...")
                sys.exit()
    return cov_dict
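# Inferred from the parsing above: the coverage map (-m) is a whitespace-delimited
# file with one sample per line (hypothetical values):
#   sample_1    55
#   sample_2    80
# where column 2 is that sample's average depth of coverage.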
def get_readFile_components(full_file_path):
    """function adapted from:
    https://github.com/katholt/srst2 - tested"""
    (file_path,file_name) = os.path.split(full_file_path)
    m1 = re.match("(.*).gz",file_name)
    ext = ""
    if m1 is not None:
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext,ext2) = os.path.splitext(file_name)
    full_ext = ext2+ext
    return file_path,file_name_before_ext,full_ext
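# Example: "/data/sample_1_R1.fastq.gz" is split into
#   ("/data", "sample_1_R1", ".fastq.gz")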
def read_file_sets(dir_path):
    """match up pairs of sequence data, adapted from
    https://github.com/katholt/srst2 will be tough to test
    with variable names and read paths"""
    fileSets = {}
    forward_reads = {}
    reverse_reads = {}
    num_paired_readsets = 0
    num_single_readsets = 0
    for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
        (file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
        # try default Illumina naming first, e.g. sample_S1_L001_R1_001; the
        # five-group pattern is restored from the srst2 source cited above
        m = re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
        if m is None:
            m = re.match("(.*)("+"_R1"+")(.*)$",file_name_before_ext)
            if m is not None:
                (baseName,read) = m.groups()[0], m.groups()[1]
                forward_reads[baseName] = infile
            else:
                m = re.match("(.*)("+"_R2"+")(.*)$",file_name_before_ext)
                if m is not None:
                    (baseName,read) = m.groups()[0], m.groups()[1]
                    reverse_reads[baseName] = infile
                else:
                    fileSets[file_name_before_ext] = infile
                    num_single_readsets += 1
        else:
            baseName, read = m.groups()[0], m.groups()[3]
            if read == "_R1":
                forward_reads[baseName] = infile
            elif read == "_R2":
                reverse_reads[baseName] = infile
            else:
                fileSets[file_name_before_ext] = infile
                num_single_readsets += 1
    for sample in forward_reads:
        if sample in reverse_reads:
            fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
            num_paired_readsets += 1
        else:
            fileSets[sample] = [forward_reads[sample]] # no reverse found
            num_single_readsets += 1
    for sample in reverse_reads:
        if sample not in fileSets:
            fileSets[sample] = reverse_reads[sample] # no forward found
            num_single_readsets += 1
    if num_paired_readsets > 0:
        logging.info('Total paired readsets found: ' + str(num_paired_readsets))
    elif num_single_readsets > 0:
        logging.info('Total single reads found: ' + str(num_single_readsets))
    return fileSets
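# Sketch of the returned structure for a directory holding sample_1_R1.fastq.gz
# and sample_1_R2.fastq.gz (hypothetical filenames):
#   {"sample_1": ["/reads/sample_1_R1.fastq.gz", "/reads/sample_1_R2.fastq.gz"]}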
def _make_bams_workflow(data):
    name = data[0]
    # read sets can arrive as a single path string or as a [R1, R2] list;
    # normalize so indexing below is safe either way
    reads = data[1]
    if isinstance(reads, str):
        reads = [reads]
    read_1 = reads[0]
    try:
        read_2 = reads[1]
    except IndexError:
        read_2 = "NULL"
    reference = data[2]
    aligner = data[3]
    if "NULL" not in read_2:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            subprocess.check_call("bowtie2 -x Ref -1 %s -2 %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,read_2,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    else:
        if "minimap2" in aligner:
            subprocess.check_call("minimap2 -ax sr %s %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (reference,read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        else:
            # bowtie2 takes unpaired reads with -U; -1 requires a matching -2
            subprocess.check_call("bowtie2 -x Ref -U %s | samtools sort -l 0 -@ 4 - | samtools view -Su -F 4 -o %s.bam -" % (read_1,name),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
def make_bams(fileSets, ref, processors, aligner):
    files_and_temp_names = []
    for idx,f in fileSets.items():
        files_and_temp_names.append([idx,f,ref,aligner])
    mp_shell(_make_bams_workflow, files_and_temp_names, processors)
def main(aligner,bam_dir,ref,read_dir,cov,processors,cov_map,filter_percent):
    dependencies = ['minimap2','bowtie2','samtools']
    for dependency in dependencies:
        if is_tool(dependency) is False:
            print("%s is not installed..exiting" % dependency)
            sys.exit()
    start_dir = os.getcwd()
    start_path = os.path.abspath(start_dir)
    if bam_dir:
        bam_path = os.path.abspath(bam_dir)
    elif read_dir:
        read_path = os.path.abspath(read_dir)
    else:
        print("Either bam_path or read_path needed")
        sys.exit()
    if os.path.exists(ref) is False:
        print('reference file cannot be found')
        sys.exit()
    else:
        ref_path = os.path.abspath(ref)
    #This is a checkpoint, although the file is not checked for completeness
    if os.path.isfile("all_results"):
        pass
    else:
        if "NULL" not in cov_map:
            cov_dict = make_cov_dict(cov_map)
        get_seq_length(ref_path)
        subprocess.check_call('sed "s/ /\\t/g" tmp.txt > genome_size.txt',stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
        #Get read-pair information
        if read_dir:
            fileSets = read_file_sets("%s" % read_path)
            if "minimap2" in aligner:
                try:
                    os.mkdir("minimap2_bams")
                except FileExistsError:
                    pass
                os.chdir("minimap2_bams")
            elif "bowtie2" in aligner:
                try:
                    os.mkdir("bowtie2_bams")
                except FileExistsError:
                    pass
                os.chdir("bowtie2_bams")
                subprocess.check_call("bowtie2-build %s Ref" % ref_path,stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
            make_bams(fileSets,ref_path,processors,aligner)
            os.chdir(start_path)
            # index BAMs from whichever aligner was used, not only minimap2
            run_samtools("%s_bams" % aligner,processors)
            remove_column()
        elif bam_dir:
            run_samtools(bam_path,processors)
            remove_column()
        if "NULL" in cov_map:
            positions = sum_coverage(cov)
            #If the amount_covered.txt files are empty, the merge would throw an error
            if len(positions) > 0:
                merge_files_by_column(0,"genome_size.txt")
            else:
                print("no positions pass minimum coverage...exiting")
                sys.exit()
        else:
            sum_coverage_dict(cov_dict,filter_percent)
            merge_files_by_column(0,"genome_size.txt")
        report_stats()
        subprocess.check_call("cat *finals > all_results", stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
    average_chromosomes("all_results")
    #The following function reports average depth of coverage; breadth is in all_results
    report_average_coverage("genome_size.txt")
    generate_matrix("all_results","breadth_matrix.txt")
    generate_matrix("all_covs","coverage_matrix.txt")
    #Clean up intermediate files
    try:
        subprocess.check_call("rm *.bam.out *finals *amount_covered.txt *results.txt all_results_averaged all_results all_covs *.coverage.txt tmp.txt genome_size.txt",
                              stderr=open(os.devnull, 'wb'),shell=True)
    except subprocess.CalledProcessError:
        pass
if __name__ == "__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--aligner", dest="aligner",
                      help="which aligner to use? minimap2 [default] or bowtie2?",
                      action="store", type="string", default="minimap2")
    parser.add_option("-b", "--input_bam", dest="bam_dir",
                      help="full path to BAM directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-r", "--reference", dest="ref",
                      help="full path to reference FASTA [REQUIRED]",
                      action="store", type="string")
    parser.add_option("-d", "--read_dir", dest="read_dir",
                      help="full path to reads directory",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-c", "--coverage", dest="cov",
                      help="minimum coverage across the genome, defaults to 10x; only applies if no map is provided",
                      default=10, type="int")
    parser.add_option("-p", "--processors", dest="processors",
                      help="number of processors to apply to the job, defaults to 2",
                      default=2, type="int")
    parser.add_option("-m", "--cov_map", dest="cov_map",
                      help="coverage map file for each genome",
                      default="NULL", type="string")
    parser.add_option("-f", "--filter_percent", dest="filter_percent",
                      help="filter breadth below this fraction of the mapped average coverage, defaults to 0.25",
                      default=0.25, type="float")
    options, args = parser.parse_args()
    mandatories = ["ref"]
    for m in mandatories:
        if not options.__dict__[m]:
            print("\nMust provide %s.\n" % m)
            parser.print_help()
            exit(-1)
    main(options.aligner,options.bam_dir,options.ref,options.read_dir,options.cov,options.processors,
         options.cov_map,options.filter_percent)