Last active
November 1, 2018 18:02
-
-
Save jasonsahl/2eedc0ea93f90097890879e56b0c3fa3 to your computer and use it in GitHub Desktop.
Uses BLAST to perform MLST typing on genome assemblies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""calculates MLST types from assemblies using BLAST. | |
If the gene is truncated, it will report a "T" and if | |
the gene has no blast hit, it will report a "X". | |
Your reference allele names must all end in "fasta" | |
and must contain a "_" between gene name and number. | |
The only external dependency is blast+ - tested version | |
is 2.2.31""" | |
from __future__ import print_function | |
import optparse | |
import glob | |
import subprocess | |
import os | |
import re | |
import sys | |
import shutil | |
def test_file(option, opt_str, value, parser): | |
try: | |
with open(value): setattr(parser.values, option.dest, value) | |
except IOError: | |
print('%s cannot be opened' % option) | |
sys.exit() | |
def test_dir(option, opt_str, value, parser): | |
if os.path.exists(value): | |
setattr(parser.values, option.dest, value) | |
else: | |
print("%s cannot be found" % option) | |
sys.exit() | |
def mp_shell(func, params, numProc): | |
from multiprocessing import Pool | |
p = Pool(numProc) | |
out = p.map(func, params) | |
p.terminate() | |
return out | |
def get_gene_order(reference): | |
firstLine = open(reference).readline() | |
fields = firstLine.split() | |
ordered = fields[1:] | |
return ordered | |
def get_seq_name(in_fasta): | |
"""used for renaming the sequences""" | |
return os.path.basename(in_fasta) | |
def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers): | |
outfile = open("%s.profile.txt" % prefix, "w") | |
pad_order = list(ordered) | |
pad_order.insert(0,"genome") | |
pad_order.insert(num_markers+1,"ST") | |
outfile.write("\t".join(pad_order)) | |
outfile.write("\n") | |
for infile in glob.glob(os.path.join(directory,"*blast.redux.out")): | |
alist = [] | |
file_name = get_seq_name(infile) | |
genome_name = file_name.replace(".fasta.blast.redux.out","") | |
final_profile = [] | |
final_profile.append(genome_name) | |
allele_profile = {} | |
for allele in ordered: | |
for line in open(infile, "rU"): | |
newline = line.strip("\n") | |
fields = newline.split() | |
allele_fields = fields[0].split("_") | |
allele_name = allele_fields[0] | |
"""This is where there is a match""" | |
if allele in allele_name: | |
try: | |
if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new): | |
allele_profile.update({allele:allele_fields[1]}) | |
elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new): | |
allele_profile.update({allele:"N"}) | |
else: | |
allele_profile.update({allele:"T"}) | |
except: | |
allele_profile.update({allele:"X"}) | |
for allele in ordered: | |
if allele in allele_profile: | |
final_profile.append(allele_profile.get(allele)) | |
else: | |
final_profile.append("X") | |
if "X" in final_profile[1:]: | |
final_profile.insert(len(final_profile),"missing") | |
elif "T" in final_profile[1:]: | |
final_profile.insert(len(final_profile),"truncated") | |
elif "N" in final_profile[1:]: | |
final_profile.insert(len(final_profile),"novel") | |
else: | |
pass | |
"""If this is the case, I know that there are no ST matches""" | |
if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]: | |
outfile.write("\t".join(final_profile)) | |
outfile.write("\n") | |
else: | |
"""need to read in the first line of this file first""" | |
firstLine = open(reference).readline() | |
first_fields = firstLine.split() | |
if "clonal_complex" in first_fields: | |
"""This ignores the clonal complex and ST fields, | |
seems to be working correclty""" | |
first_fields = first_fields[1:-1] | |
num_fields = len(first_fields[1:]) | |
else: | |
num_fields = len(first_fields[1:]) | |
for line in open(reference, "rU"): | |
sum =[] | |
ref_fields=line.split() | |
"""Sum is how many matches that I have. | |
Must equal the number of alleles to receive | |
a sequence type""" | |
for i in range(1,num_fields+1): | |
try: | |
if ref_fields[i]==final_profile[i]: | |
sum.append(i) | |
except: | |
pass | |
if int(len(sum))==int(num_fields): | |
"""if there is a complete match, then add the Sequence type | |
to the beginning of the list""" | |
final_profile.insert(int(num_fields)+1,ref_fields[0]) | |
else: | |
pass | |
#final_profile.insert(int(num_fields)+1,"novel") | |
"""This assumes that the profile is complete""" | |
if len(final_profile) == 9: | |
if final_profile[-1] == "novel" or final_profile[-1] == "truncated" or final_profile[-1] == "missing": | |
pass | |
else: | |
outfile.write("\t".join(final_profile)) | |
outfile.write("\n") | |
elif len(final_profile) == 8: | |
final_profile.insert(int(num_fields)+1,"novel") | |
outfile.write("\t".join(final_profile)) | |
outfile.write("\n") | |
outfile.close() | |
def _perform_blast_workflow(data): | |
genome = "".join(data) | |
if genome.endswith(".fasta"): | |
try: | |
subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True) | |
except: | |
print("problem found in formatting genome %s" % genome) | |
if ".fasta" in genome: | |
try: | |
devnull = open('/dev/null', 'w') | |
cmd = ["blastn", | |
"-task", "blastn", | |
"-query", "all_loci.fasta", | |
"-db", genome, | |
"-out", "%s_blast.out" % (genome), | |
"-evalue", "0.0001", | |
"-dust", "no", | |
"-outfmt", "6"] | |
subprocess.call(cmd, stdout=devnull, stderr=devnull) | |
devnull.close() | |
except: | |
print("genomes %s cannot be used" % genome) | |
ref_list = [] | |
newfile = open("%s.blast.redux.out" % genome, "w") | |
os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome)) | |
for line in open("%s.blast.sorted" % genome): | |
newline = line.strip() | |
fields = line.split() | |
allele_names = fields[0].split("_") | |
if allele_names[0] in ref_list: | |
pass | |
else: | |
ref_list.append(allele_names[0]) | |
newfile.write(line) | |
newfile.close() | |
def run_blast_dev(ref_loci_file,processors,genome_list): | |
curr_dir=os.getcwd() | |
mp_shell(_perform_blast_workflow,genome_list,processors) | |
def make_ST_map(profile): | |
map = {} | |
with open(profile) as my_profile: | |
for line in my_profile: | |
newline = line.strip() | |
fields = newline.split() | |
if fields[0] == "genome": | |
pass | |
else: | |
if len(fields) == 9: | |
map.update({fields[0]:fields[8]}) | |
else: | |
map.update({fields[0]:"unknown"}) | |
return map | |
def main(directory,fastas,trunc,new,processors,prefix): | |
curr_dir = os.getcwd() | |
ap = os.path.abspath(curr_dir) | |
ref_seq_dir=os.path.abspath(directory) | |
reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt"))) | |
ordered = get_gene_order(reference) | |
fastas_path = os.path.abspath(fastas) | |
if os.path.exists("%s/mlst_scratch" % ap): | |
os.system("rm -rf %s/mlst_scratch" % ap) | |
os.makedirs("%s/mlst_scratch" % ap) | |
else: | |
os.makedirs("%s/mlst_scratch" % ap) | |
if os.path.exists("%s/ST_renames" % ap): | |
os.system("rm -rf %s/ST_renames" % ap) | |
os.makedirs("%s/ST_renames" % ap) | |
else: | |
os.makedirs("%s/ST_renames" % ap) | |
tmp_dir = "%s/mlst_scratch" % ap | |
if "clonal_complex" in ordered: | |
ordered=ordered[:-1] | |
"""file_dir is a list of all of your FASTA files""" | |
for afile in glob.glob(os.path.join(fastas,"*fasta")): | |
shutil.copy2(afile,tmp_dir) | |
files_and_temp_names = [] | |
for infile in glob.glob(os.path.join(tmp_dir,"*fasta")): | |
genome = get_seq_name(infile) | |
files_and_temp_names.append(genome) | |
os.chdir(tmp_dir) | |
os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir) | |
print("blast starting") | |
run_blast_dev("all_loci.fasta",processors,files_and_temp_names) | |
print("blast finished, parsing results") | |
process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered)) | |
st_map = make_ST_map("%s.profile.txt" % prefix) | |
for k,v in st_map.items(): | |
old_name = str(k)+".fasta" | |
new_name = str(k)+"_ST"+str(v)+".fasta" | |
os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name)) | |
os.system("mv *.profile.txt %s" % ap) | |
os.chdir(ap) | |
os.system("rm -rf %s/mlst_scratch" % ap) | |
if __name__ == "__main__": | |
usage="usage: %prog [options]" | |
parser = optparse.OptionParser(usage=usage) | |
parser.add_option("-d", "--MLST_dir", dest="directory", | |
help="/path/to/MLST_seq_directory [REQUIRED]", | |
action="callback", callback=test_dir, type="string") | |
parser.add_option("-f", "--fasta dir", dest="fastas", | |
help="/path/to/your_genome_FASTA_dir [REQUIRED]", | |
action="callback", callback=test_dir, type="string") | |
parser.add_option("-t", "--truncated value", dest="trunc", | |
help="gene is considered truncated if below this value [default = 0.95]", | |
default="0.95", type="float") | |
parser.add_option("-n", "--new", dest="new", | |
help="under this value is considered to be a new allele", | |
default="100", type="int") | |
parser.add_option("-p", "--processors", dest="processors", | |
help="Number of processors to use, defaults to 2", | |
default="2", type="int", action="store") | |
parser.add_option("-y", "--prefix", dest="prefix", | |
help="Output prefix for files [REQUIRED]", | |
action="store", type="string") | |
options, args = parser.parse_args() | |
mandatories = ["directory","fastas","prefix"] | |
for m in mandatories: | |
if not getattr(options, m, None): | |
print("\nMust provide %s.\n" %m) | |
parser.print_help() | |
exit(-1) | |
main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment