Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active June 21, 2023 20:40
Show Gist options
  • Save jasonsahl/2a232947a3578283f54c to your computer and use it in GitHub Desktop.
Save jasonsahl/2a232947a3578283f54c to your computer and use it in GitHub Desktop.
Sequences are extracted and concatenated from FASTA genome assemblies. Requires blast+ to be installed.
#!/usr/bin/env python
"""pull sequences out of a set of genomes,
concatenate them for phylogeny"""
import optparse
import sys
import glob
try:
from Bio import SeqIO
except:
print("biopython must be installed..exiting..")
sys.exit()
import os
import subprocess
def is_tool(name):
from distutils.spawn import find_executable
return find_executable(name) is not None
def test_dir(option, opt_str, value, parser):
if os.path.exists(value):
setattr(parser.values, option.dest, value)
else:
print("%s directory cannot be found" % option)
sys.exit()
def get_seq_name(fasta_in):
name = os.path.basename(fasta_in)
return name
def combine_seqs(dir_path):
handle = open("combined.seqs", "w")
for infile in glob.glob(os.path.join(dir_path, '*.fasta')):
names = get_seq_name(infile)
reduced = names.replace('.fasta','')
handle.write(">"+str(reduced)+"\n")
with open(infile) as my_file:
for record in SeqIO.parse(my_file, "fasta"):
handle.write(str(record.seq)+"\n")
handle.close()
def run_blast(seq_path, blast):
if blast == "blastn":
extension = "*.fasta"
else:
extension = "*.pep"
for infile in glob.glob(os.path.join(seq_path, '%s' % extension)):
names = get_seq_name(infile)
if "fasta" in extension:
reduced = names.replace('.fasta','')
else:
reduced = names.replace('.pep','')
if blast == "blastn":
os.system('blastn -task blastn -num_threads 8 -query %s -db combined.seqs -out %s.blast.out -dust no -num_alignments 20000 -outfmt "7 std sseq"' % (infile,reduced))
elif blast == "tblastn":
print('tblastn -query %s -db combined.seqs -out %s.blast.out -seg no -num_alignments 20000 -outfmt "7 std sseq"' % (infile, reduced))
os.system('tblastn -query %s -db combined.seqs -out %s.blast.out -seg no -num_alignments 20000 -outfmt "7 std sseq"' % (infile, reduced))
else:
print("option not supported..choose from blastn or tblastn")
sys.exit()
subprocess.check_call("sort -u -k 2,2 %s.blast.out > %s.blast.parsed" % (reduced,reduced), shell=True)
parsed_blast_to_seqs("%s.blast.parsed" % reduced, "%s.extracted.seqs" % reduced)
def parsed_blast_to_seqs(parsed_file, outfile):
handle = open(outfile, "w")
with open(parsed_file) as infile:
for line in infile:
if line.startswith("#"):
pass
else:
fields = line.split()
handle.write(">"+str(fields[1])+"\n"+fields[12]+"\n")
handle.close()
def get_names(fasta_file):
names = []
with open(fasta_file) as my_file:
for record in SeqIO.parse(my_file, "fasta"):
names.append(record.id)
return names
def split_files():
import glob
import os
curr_dir=os.getcwd()
marker_names = []
for infile in glob.glob(os.path.join(curr_dir, '*.extracted.seqs')):
names = get_seq_name(infile)
reduced = names.replace('.extracted.seqs','')
marker_names.append(reduced)
sorted(marker_names)
for marker in marker_names:
with open("%s.extracted.seqs" % marker) as my_file:
"""The markers should now be in order"""
for record in SeqIO.parse(my_file, "fasta"):
marker_name = record.id
handle = open("%s.seqs.fasta" % marker_name, "a")
handle.write(">"+str(record.seq)+"\n"+str(record.seq)+"\n")
handle.close()
def process_fastas():
curr_dir=os.getcwd()
for infile in glob.glob(os.path.join(curr_dir, '*seqs.fasta')):
handle = open("%s.concat" % infile, "w")
names = get_seq_name(infile)
handle.write(">"+str(names)+"\n")
with open(infile) as my_file:
for record in SeqIO.parse(my_file, "fasta"):
seqs = [ ]
seqs.append(record.seq)
handle.write("".join([str(x) for x in seqs])+"\n")
handle.close()
def main(directory,seqs,blast,muscle):
dir_path=os.path.abspath("%s" % directory)
seq_path=os.path.abspath("%s" % seqs)
result = is_tool(blast)
if result is False:
print("%s must be in your path..exiting" % blast)
sys.exit()
#Will skip this step if the file has been created. Does not check to see if it is complete
if os.path.isfile("combined.seqs"):
pass
else:
combine_seqs(dir_path)
result = is_tool("makeblastdb")
if result is False:
print('makeblastdb must be in your path..exiting')
sys.exit()
else:
os.system("makeblastdb -in combined.seqs -dbtype nucl > /dev/null 2>&1")
run_blast(seq_path,blast)
os.system("rm *.blast.out *.blast.parsed")
split_files()
process_fastas()
if muscle == "T":
result = is_tool("muscle")
if result is False:
print("muscle v5 must be in your path..exiting")
sys.exit()
else:
try:
os.system("cat *.seqs.fasta.concat > tmp_concatenated")
os.system('sed "s/ //g" tmp_concatenated > all_concatenated')
os.system("muscle -super5 all_concatenated -output all_concatenated_aligned.fasta")
os.system("rm *seqs.fasta* tmp_concatenated all_concatenated")
except:
print("make sure the muscle version is >5")
sys.exit()
elif muscle == "F":
print("Muscle not run")
os.system("rm *seqs.fasta*")
else:
print("unknown option..muscle not run")
os.system("rm *seqs.fasta*")
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = optparse.OptionParser(usage=usage)
parser.add_option("-d", "--directory", dest="directory",
help="path to genome files in fasta format [REQUIRED]",
type="string", action="callback", callback=test_dir)
parser.add_option("-s", "--seqs", dest="seqs",
help="path to sequence directory [REQUIRED]",
type="string", action="callback", callback=test_dir)
parser.add_option("-b", "--blast", dest="blast",
help="which blast method to use? blastn or tblastn? Defaults to blastn",
type="string", action="store", default="blastn")
parser.add_option("-m", "--muscle", dest="muscle",
help="run muscle on all concatenated? Defaults to F",
type="string", action="store", default="F")
options, args = parser.parse_args()
mandatories = ["directory", "seqs"]
for m in mandatories:
if not getattr(options, m, None):
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.directory,options.seqs,options.blast,options.muscle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment