Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active November 1, 2018 18:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasonsahl/2eedc0ea93f90097890879e56b0c3fa3 to your computer and use it in GitHub Desktop.
Save jasonsahl/2eedc0ea93f90097890879e56b0c3fa3 to your computer and use it in GitHub Desktop.
Uses BLAST to perform MLST typing on genome assemblies
#!/usr/bin/env python
"""calculates MLST types from assemblies using BLAST.
If the gene is truncated, it will report a "T" and if
the gene has no blast hit, it will report a "X".
Your reference allele names must all end in "fasta"
and must contain a "_" between gene name and number.
The only external dependency is blast+ - tested version
is 2.2.31"""
from __future__ import print_function
import optparse
import glob
import subprocess
import os
import re
import sys
import shutil
def test_file(option, opt_str, value, parser):
try:
with open(value): setattr(parser.values, option.dest, value)
except IOError:
print('%s cannot be opened' % option)
sys.exit()
def test_dir(option, opt_str, value, parser):
if os.path.exists(value):
setattr(parser.values, option.dest, value)
else:
print("%s cannot be found" % option)
sys.exit()
def mp_shell(func, params, numProc):
from multiprocessing import Pool
p = Pool(numProc)
out = p.map(func, params)
p.terminate()
return out
def get_gene_order(reference):
firstLine = open(reference).readline()
fields = firstLine.split()
ordered = fields[1:]
return ordered
def get_seq_name(in_fasta):
"""used for renaming the sequences"""
return os.path.basename(in_fasta)
def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
outfile = open("%s.profile.txt" % prefix, "w")
pad_order = list(ordered)
pad_order.insert(0,"genome")
pad_order.insert(num_markers+1,"ST")
outfile.write("\t".join(pad_order))
outfile.write("\n")
for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
alist = []
file_name = get_seq_name(infile)
genome_name = file_name.replace(".fasta.blast.redux.out","")
final_profile = []
final_profile.append(genome_name)
allele_profile = {}
for allele in ordered:
for line in open(infile, "rU"):
newline = line.strip("\n")
fields = newline.split()
allele_fields = fields[0].split("_")
allele_name = allele_fields[0]
"""This is where there is a match"""
if allele in allele_name:
try:
if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
allele_profile.update({allele:allele_fields[1]})
elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
allele_profile.update({allele:"N"})
else:
allele_profile.update({allele:"T"})
except:
allele_profile.update({allele:"X"})
for allele in ordered:
if allele in allele_profile:
final_profile.append(allele_profile.get(allele))
else:
final_profile.append("X")
if "X" in final_profile[1:]:
final_profile.insert(len(final_profile),"missing")
elif "T" in final_profile[1:]:
final_profile.insert(len(final_profile),"truncated")
elif "N" in final_profile[1:]:
final_profile.insert(len(final_profile),"novel")
else:
pass
"""If this is the case, I know that there are no ST matches"""
if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
outfile.write("\t".join(final_profile))
outfile.write("\n")
else:
"""need to read in the first line of this file first"""
firstLine = open(reference).readline()
first_fields = firstLine.split()
if "clonal_complex" in first_fields:
"""This ignores the clonal complex and ST fields,
seems to be working correclty"""
first_fields = first_fields[1:-1]
num_fields = len(first_fields[1:])
else:
num_fields = len(first_fields[1:])
for line in open(reference, "rU"):
sum =[]
ref_fields=line.split()
"""Sum is how many matches that I have.
Must equal the number of alleles to receive
a sequence type"""
for i in range(1,num_fields+1):
try:
if ref_fields[i]==final_profile[i]:
sum.append(i)
except:
pass
if int(len(sum))==int(num_fields):
"""if there is a complete match, then add the Sequence type
to the beginning of the list"""
final_profile.insert(int(num_fields)+1,ref_fields[0])
else:
pass
#final_profile.insert(int(num_fields)+1,"novel")
"""This assumes that the profile is complete"""
if len(final_profile) == 9:
if final_profile[-1] == "novel" or final_profile[-1] == "truncated" or final_profile[-1] == "missing":
pass
else:
outfile.write("\t".join(final_profile))
outfile.write("\n")
elif len(final_profile) == 8:
final_profile.insert(int(num_fields)+1,"novel")
outfile.write("\t".join(final_profile))
outfile.write("\n")
outfile.close()
def _perform_blast_workflow(data):
genome = "".join(data)
if genome.endswith(".fasta"):
try:
subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
except:
print("problem found in formatting genome %s" % genome)
if ".fasta" in genome:
try:
devnull = open('/dev/null', 'w')
cmd = ["blastn",
"-task", "blastn",
"-query", "all_loci.fasta",
"-db", genome,
"-out", "%s_blast.out" % (genome),
"-evalue", "0.0001",
"-dust", "no",
"-outfmt", "6"]
subprocess.call(cmd, stdout=devnull, stderr=devnull)
devnull.close()
except:
print("genomes %s cannot be used" % genome)
ref_list = []
newfile = open("%s.blast.redux.out" % genome, "w")
os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
for line in open("%s.blast.sorted" % genome):
newline = line.strip()
fields = line.split()
allele_names = fields[0].split("_")
if allele_names[0] in ref_list:
pass
else:
ref_list.append(allele_names[0])
newfile.write(line)
newfile.close()
def run_blast_dev(ref_loci_file,processors,genome_list):
curr_dir=os.getcwd()
mp_shell(_perform_blast_workflow,genome_list,processors)
def make_ST_map(profile):
map = {}
with open(profile) as my_profile:
for line in my_profile:
newline = line.strip()
fields = newline.split()
if fields[0] == "genome":
pass
else:
if len(fields) == 9:
map.update({fields[0]:fields[8]})
else:
map.update({fields[0]:"unknown"})
return map
def main(directory,fastas,trunc,new,processors,prefix):
curr_dir = os.getcwd()
ap = os.path.abspath(curr_dir)
ref_seq_dir=os.path.abspath(directory)
reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
ordered = get_gene_order(reference)
fastas_path = os.path.abspath(fastas)
if os.path.exists("%s/mlst_scratch" % ap):
os.system("rm -rf %s/mlst_scratch" % ap)
os.makedirs("%s/mlst_scratch" % ap)
else:
os.makedirs("%s/mlst_scratch" % ap)
if os.path.exists("%s/ST_renames" % ap):
os.system("rm -rf %s/ST_renames" % ap)
os.makedirs("%s/ST_renames" % ap)
else:
os.makedirs("%s/ST_renames" % ap)
tmp_dir = "%s/mlst_scratch" % ap
if "clonal_complex" in ordered:
ordered=ordered[:-1]
"""file_dir is a list of all of your FASTA files"""
for afile in glob.glob(os.path.join(fastas,"*fasta")):
shutil.copy2(afile,tmp_dir)
files_and_temp_names = []
for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
genome = get_seq_name(infile)
files_and_temp_names.append(genome)
os.chdir(tmp_dir)
os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
print("blast starting")
run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
print("blast finished, parsing results")
process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
st_map = make_ST_map("%s.profile.txt" % prefix)
for k,v in st_map.items():
old_name = str(k)+".fasta"
new_name = str(k)+"_ST"+str(v)+".fasta"
os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
os.system("mv *.profile.txt %s" % ap)
os.chdir(ap)
os.system("rm -rf %s/mlst_scratch" % ap)
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = optparse.OptionParser(usage=usage)
parser.add_option("-d", "--MLST_dir", dest="directory",
help="/path/to/MLST_seq_directory [REQUIRED]",
action="callback", callback=test_dir, type="string")
parser.add_option("-f", "--fasta dir", dest="fastas",
help="/path/to/your_genome_FASTA_dir [REQUIRED]",
action="callback", callback=test_dir, type="string")
parser.add_option("-t", "--truncated value", dest="trunc",
help="gene is considered truncated if below this value [default = 0.95]",
default="0.95", type="float")
parser.add_option("-n", "--new", dest="new",
help="under this value is considered to be a new allele",
default="100", type="int")
parser.add_option("-p", "--processors", dest="processors",
help="Number of processors to use, defaults to 2",
default="2", type="int", action="store")
parser.add_option("-y", "--prefix", dest="prefix",
help="Output prefix for files [REQUIRED]",
action="store", type="string")
options, args = parser.parse_args()
mandatories = ["directory","fastas","prefix"]
for m in mandatories:
if not getattr(options, m, None):
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment