jasonsahl/mlst_blast_assemblies.py

## mlst_blast_assemblies.py
#!/usr/bin/env python

"""calculates MLST types from assemblies using BLAST.
If the gene is truncated, it will report a "T" and if
the gene has no blast hit, it will report a "X".
Your reference allele names must all end in "fasta"
and must contain a "_" between gene name and number.
The only external dependency is blast+ - tested version
is 2.2.31"""

from __future__ import print_function
import optparse
import glob
import subprocess
import os
import re
import sys
import shutil

def test_file(option, opt_str, value, parser):
    try:
        with open(value): setattr(parser.values, option.dest, value)
    except IOError:
        print('%s cannot be opened' % option)
        sys.exit()

def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("%s cannot be found" % option)
        sys.exit()

def mp_shell(func, params, numProc):
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.terminate()
    return out

def get_gene_order(reference):
    firstLine = open(reference).readline()
    fields = firstLine.split()
    ordered = fields[1:]
    return ordered

def get_seq_name(in_fasta):
    """used for renaming the sequences"""
    return os.path.basename(in_fasta)

def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
    outfile = open("%s.profile.txt" % prefix, "w")
    pad_order = list(ordered)
    pad_order.insert(0,"genome")
    pad_order.insert(num_markers+1,"ST")
    outfile.write("\t".join(pad_order))
    outfile.write("\n")
    for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
        alist = []
        file_name = get_seq_name(infile)
        genome_name = file_name.replace(".fasta.blast.redux.out","")
        final_profile = []
        final_profile.append(genome_name)
        allele_profile = {}
        for allele in ordered:
            for line in open(infile, "rU"):
                newline = line.strip("\n")
                fields = newline.split()
                allele_fields = fields[0].split("_")
                allele_name = allele_fields[0]
                """This is where there is a match"""
                if allele in allele_name:
                    try:
                        if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
                            allele_profile.update({allele:allele_fields[1]})
                        elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
                            allele_profile.update({allele:"N"})
                        else:
                            allele_profile.update({allele:"T"})
                    except:
                        allele_profile.update({allele:"X"})
        for allele in ordered:
            if allele in allele_profile:
                final_profile.append(allele_profile.get(allele))
            else:
                final_profile.append("X")
        if "X" in final_profile[1:]:
            final_profile.insert(len(final_profile),"missing")
        elif "T" in final_profile[1:]:
            final_profile.insert(len(final_profile),"truncated")
        elif "N" in final_profile[1:]:
            final_profile.insert(len(final_profile),"novel")
        else:
            pass
        """If this is the case, I know that there are no ST matches"""
        if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
            outfile.write("\t".join(final_profile))
            outfile.write("\n")
        else:
            """need to read in the first line of this file first"""
            firstLine = open(reference).readline()
            first_fields = firstLine.split()
            if "clonal_complex" in first_fields:
                """This ignores the clonal complex and ST fields,
                seems to be working correclty"""
                first_fields = first_fields[1:-1]
                num_fields = len(first_fields[1:])
            else:
                num_fields = len(first_fields[1:])
            for line in open(reference, "rU"):
                sum =[]
                ref_fields=line.split()
                """Sum is how many matches that I have.
                Must equal the number of alleles to receive
                a sequence type"""
                for i in range(1,num_fields+1):
                    try:
                        if ref_fields[i]==final_profile[i]:
                            sum.append(i)
                    except:
                        pass
                if int(len(sum))==int(num_fields):
                    """if there is a complete match, then add the Sequence type
                    to the beginning of the list"""
                    final_profile.insert(int(num_fields)+1,ref_fields[0])
                else:
                    pass
                    #final_profile.insert(int(num_fields)+1,"novel")
        """This assumes that the profile is complete"""
        if len(final_profile) == 9:
            if final_profile[-1] == "novel" or final_profile[-1] == "truncated" or final_profile[-1] == "missing":
                pass
            else:
                outfile.write("\t".join(final_profile))
                outfile.write("\n")
        elif len(final_profile) == 8:
            final_profile.insert(int(num_fields)+1,"novel")
            outfile.write("\t".join(final_profile))
            outfile.write("\n")
    outfile.close()

def _perform_blast_workflow(data):
    genome = "".join(data)
    if genome.endswith(".fasta"):
        try:
            subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
        except:
            print("problem found in formatting genome %s" % genome)
    if ".fasta" in genome:
        try:
            devnull = open('/dev/null', 'w')
            cmd = ["blastn",
                   "-task", "blastn",
                   "-query", "all_loci.fasta",
                   "-db", genome,
                   "-out", "%s_blast.out" % (genome),
                   "-evalue", "0.0001",
                   "-dust", "no",
                   "-outfmt", "6"]
            subprocess.call(cmd, stdout=devnull, stderr=devnull)
            devnull.close()
        except:
            print("genomes %s cannot be used" % genome)
    ref_list = []
    newfile = open("%s.blast.redux.out" % genome, "w")
    os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
    for line in open("%s.blast.sorted" % genome):
        newline = line.strip()
        fields = line.split()
        allele_names = fields[0].split("_")
        if allele_names[0] in ref_list:
            pass
        else:
            ref_list.append(allele_names[0])
            newfile.write(line)
    newfile.close()

def run_blast_dev(ref_loci_file,processors,genome_list):
    curr_dir=os.getcwd()
    mp_shell(_perform_blast_workflow,genome_list,processors)

def make_ST_map(profile):
    map = {}
    with open(profile) as my_profile:
        for line in my_profile:
            newline = line.strip()
            fields = newline.split()
            if fields[0] == "genome":
                pass
            else:
                if len(fields) == 9:
                    map.update({fields[0]:fields[8]})
                else:
                    map.update({fields[0]:"unknown"})
    return map

def main(directory,fastas,trunc,new,processors,prefix):
    curr_dir = os.getcwd()
    ap = os.path.abspath(curr_dir)
    ref_seq_dir=os.path.abspath(directory)
    reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
    ordered = get_gene_order(reference)
    fastas_path = os.path.abspath(fastas)
    if os.path.exists("%s/mlst_scratch" % ap):
        os.system("rm -rf %s/mlst_scratch" % ap)
        os.makedirs("%s/mlst_scratch" % ap)
    else:
        os.makedirs("%s/mlst_scratch" % ap)
    if os.path.exists("%s/ST_renames" % ap):
        os.system("rm -rf %s/ST_renames" % ap)
        os.makedirs("%s/ST_renames" % ap)
    else:
        os.makedirs("%s/ST_renames" % ap)
    tmp_dir = "%s/mlst_scratch" % ap
    if "clonal_complex" in ordered:
        ordered=ordered[:-1]
    """file_dir is a list of all of your FASTA files"""
    for afile in glob.glob(os.path.join(fastas,"*fasta")):
        shutil.copy2(afile,tmp_dir)
    files_and_temp_names = []
    for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
        genome = get_seq_name(infile)
        files_and_temp_names.append(genome)
    os.chdir(tmp_dir)
    os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
    print("blast starting")
    run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
    print("blast finished, parsing results")
    process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
    st_map = make_ST_map("%s.profile.txt" % prefix)
    for k,v in st_map.items():
        old_name = str(k)+".fasta"
        new_name = str(k)+"_ST"+str(v)+".fasta"
        os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
    os.system("mv *.profile.txt %s" % ap)
    os.chdir(ap)
    os.system("rm -rf %s/mlst_scratch" % ap)

if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", "--MLST_dir", dest="directory",
                      help="/path/to/MLST_seq_directory [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-f", "--fasta dir", dest="fastas",
                      help="/path/to/your_genome_FASTA_dir [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-t", "--truncated value", dest="trunc",
                      help="gene is considered truncated if below this value [default = 0.95]",
                      default="0.95", type="float")
    parser.add_option("-n", "--new", dest="new",
                      help="under this value is considered to be a new allele",
                      default="100", type="int")
    parser.add_option("-p", "--processors", dest="processors",
                      help="Number of processors to use, defaults to 2",
                      default="2", type="int", action="store")
    parser.add_option("-y", "--prefix", dest="prefix",
                      help="Output prefix for files [REQUIRED]",
                      action="store", type="string")
    options, args = parser.parse_args()

    mandatories = ["directory","fastas","prefix"]
    for m in mandatories:
        if not getattr(options, m, None):
            print("\nMust provide %s.\n" %m)
            parser.print_help()
            exit(-1)
    main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)
	#!/usr/bin/env python

	"""calculates MLST types from assemblies using BLAST.
	If the gene is truncated, it will report a "T" and if
	the gene has no blast hit, it will report a "X".
	Your reference allele names must all end in "fasta"
	and must contain a "_" between gene name and number.
	The only external dependency is blast+ - tested version
	is 2.2.31"""

	from __future__ import print_function
	import optparse
	import glob
	import subprocess
	import os
	import re
	import sys
	import shutil

	def test_file(option, opt_str, value, parser):
	try:
	with open(value): setattr(parser.values, option.dest, value)
	except IOError:
	print('%s cannot be opened' % option)
	sys.exit()

	def test_dir(option, opt_str, value, parser):
	if os.path.exists(value):
	setattr(parser.values, option.dest, value)
	else:
	print("%s cannot be found" % option)
	sys.exit()

	def mp_shell(func, params, numProc):
	from multiprocessing import Pool
	p = Pool(numProc)
	out = p.map(func, params)
	p.terminate()
	return out

	def get_gene_order(reference):
	firstLine = open(reference).readline()
	fields = firstLine.split()
	ordered = fields[1:]
	return ordered

	def get_seq_name(in_fasta):
	"""used for renaming the sequences"""
	return os.path.basename(in_fasta)

	def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
	outfile = open("%s.profile.txt" % prefix, "w")
	pad_order = list(ordered)
	pad_order.insert(0,"genome")
	pad_order.insert(num_markers+1,"ST")
	outfile.write("\t".join(pad_order))
	outfile.write("\n")
	for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
	alist = []
	file_name = get_seq_name(infile)
	genome_name = file_name.replace(".fasta.blast.redux.out","")
	final_profile = []
	final_profile.append(genome_name)
	allele_profile = {}
	for allele in ordered:
	for line in open(infile, "rU"):
	newline = line.strip("\n")
	fields = newline.split()
	allele_fields = fields[0].split("_")
	allele_name = allele_fields[0]
	"""This is where there is a match"""
	if allele in allele_name:
	try:
	if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
	allele_profile.update({allele:allele_fields[1]})
	elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
	allele_profile.update({allele:"N"})
	else:
	allele_profile.update({allele:"T"})
	except:
	allele_profile.update({allele:"X"})
	for allele in ordered:
	if allele in allele_profile:
	final_profile.append(allele_profile.get(allele))
	else:
	final_profile.append("X")
	if "X" in final_profile[1:]:
	final_profile.insert(len(final_profile),"missing")
	elif "T" in final_profile[1:]:
	final_profile.insert(len(final_profile),"truncated")
	elif "N" in final_profile[1:]:
	final_profile.insert(len(final_profile),"novel")
	else:
	pass
	"""If this is the case, I know that there are no ST matches"""
	if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
	outfile.write("\t".join(final_profile))
	outfile.write("\n")
	else:
	"""need to read in the first line of this file first"""
	firstLine = open(reference).readline()
	first_fields = firstLine.split()
	if "clonal_complex" in first_fields:
	"""This ignores the clonal complex and ST fields,
	seems to be working correclty"""
	first_fields = first_fields[1:-1]
	num_fields = len(first_fields[1:])
	else:
	num_fields = len(first_fields[1:])
	for line in open(reference, "rU"):
	sum =[]
	ref_fields=line.split()
	"""Sum is how many matches that I have.
	Must equal the number of alleles to receive
	a sequence type"""
	for i in range(1,num_fields+1):
	try:
	if ref_fields[i]==final_profile[i]:
	sum.append(i)
	except:
	pass
	if int(len(sum))==int(num_fields):
	"""if there is a complete match, then add the Sequence type
	to the beginning of the list"""
	final_profile.insert(int(num_fields)+1,ref_fields[0])
	else:
	pass
	#final_profile.insert(int(num_fields)+1,"novel")
	"""This assumes that the profile is complete"""
	if len(final_profile) == 9:
	if final_profile[-1] == "novel" or final_profile[-1] == "truncated" or final_profile[-1] == "missing":
	pass
	else:
	outfile.write("\t".join(final_profile))
	outfile.write("\n")
	elif len(final_profile) == 8:
	final_profile.insert(int(num_fields)+1,"novel")
	outfile.write("\t".join(final_profile))
	outfile.write("\n")
	outfile.close()

	def _perform_blast_workflow(data):
	genome = "".join(data)
	if genome.endswith(".fasta"):
	try:
	subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
	except:
	print("problem found in formatting genome %s" % genome)
	if ".fasta" in genome:
	try:
	devnull = open('/dev/null', 'w')
	cmd = ["blastn",
	"-task", "blastn",
	"-query", "all_loci.fasta",
	"-db", genome,
	"-out", "%s_blast.out" % (genome),
	"-evalue", "0.0001",
	"-dust", "no",
	"-outfmt", "6"]
	subprocess.call(cmd, stdout=devnull, stderr=devnull)
	devnull.close()
	except:
	print("genomes %s cannot be used" % genome)
	ref_list = []
	newfile = open("%s.blast.redux.out" % genome, "w")
	os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
	for line in open("%s.blast.sorted" % genome):
	newline = line.strip()
	fields = line.split()
	allele_names = fields[0].split("_")
	if allele_names[0] in ref_list:
	pass
	else:
	ref_list.append(allele_names[0])
	newfile.write(line)
	newfile.close()

	def run_blast_dev(ref_loci_file,processors,genome_list):
	curr_dir=os.getcwd()
	mp_shell(_perform_blast_workflow,genome_list,processors)

	def make_ST_map(profile):
	map = {}
	with open(profile) as my_profile:
	for line in my_profile:
	newline = line.strip()
	fields = newline.split()
	if fields[0] == "genome":
	pass
	else:
	if len(fields) == 9:
	map.update({fields[0]:fields[8]})
	else:
	map.update({fields[0]:"unknown"})
	return map

	def main(directory,fastas,trunc,new,processors,prefix):
	curr_dir = os.getcwd()
	ap = os.path.abspath(curr_dir)
	ref_seq_dir=os.path.abspath(directory)
	reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
	ordered = get_gene_order(reference)
	fastas_path = os.path.abspath(fastas)
	if os.path.exists("%s/mlst_scratch" % ap):
	os.system("rm -rf %s/mlst_scratch" % ap)
	os.makedirs("%s/mlst_scratch" % ap)
	else:
	os.makedirs("%s/mlst_scratch" % ap)
	if os.path.exists("%s/ST_renames" % ap):
	os.system("rm -rf %s/ST_renames" % ap)
	os.makedirs("%s/ST_renames" % ap)
	else:
	os.makedirs("%s/ST_renames" % ap)
	tmp_dir = "%s/mlst_scratch" % ap
	if "clonal_complex" in ordered:
	ordered=ordered[:-1]
	"""file_dir is a list of all of your FASTA files"""
	for afile in glob.glob(os.path.join(fastas,"*fasta")):
	shutil.copy2(afile,tmp_dir)
	files_and_temp_names = []
	for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
	genome = get_seq_name(infile)
	files_and_temp_names.append(genome)
	os.chdir(tmp_dir)
	os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
	print("blast starting")
	run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
	print("blast finished, parsing results")
	process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
	st_map = make_ST_map("%s.profile.txt" % prefix)
	for k,v in st_map.items():
	old_name = str(k)+".fasta"
	new_name = str(k)+"_ST"+str(v)+".fasta"
	os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
	os.system("mv *.profile.txt %s" % ap)
	os.chdir(ap)
	os.system("rm -rf %s/mlst_scratch" % ap)

	if __name__ == "__main__":
	usage="usage: %prog [options]"
	parser = optparse.OptionParser(usage=usage)
	parser.add_option("-d", "--MLST_dir", dest="directory",
	help="/path/to/MLST_seq_directory [REQUIRED]",
	action="callback", callback=test_dir, type="string")
	parser.add_option("-f", "--fasta dir", dest="fastas",
	help="/path/to/your_genome_FASTA_dir [REQUIRED]",
	action="callback", callback=test_dir, type="string")
	parser.add_option("-t", "--truncated value", dest="trunc",
	help="gene is considered truncated if below this value [default = 0.95]",
	default="0.95", type="float")
	parser.add_option("-n", "--new", dest="new",
	help="under this value is considered to be a new allele",
	default="100", type="int")
	parser.add_option("-p", "--processors", dest="processors",
	help="Number of processors to use, defaults to 2",
	default="2", type="int", action="store")
	parser.add_option("-y", "--prefix", dest="prefix",
	help="Output prefix for files [REQUIRED]",
	action="store", type="string")
	options, args = parser.parse_args()

	mandatories = ["directory","fastas","prefix"]
	for m in mandatories:
	if not getattr(options, m, None):
	print("\nMust provide %s.\n" %m)
	parser.print_help()
	exit(-1)
	main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)