-
-
Save jasonsahl/2a232947a3578283f54c to your computer and use it in GitHub Desktop.
Sequences are extracted and concatenated from FASTA genome assemblies. Requires blast+ to be installed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""pull sequences out of a set of genomes, | |
concatenate them for phylogeny""" | |
import optparse | |
import sys | |
import glob | |
try: | |
from Bio import SeqIO | |
except: | |
print("biopython must be installed..exiting..") | |
sys.exit() | |
import os | |
import subprocess | |
def is_tool(name): | |
from distutils.spawn import find_executable | |
return find_executable(name) is not None | |
def test_dir(option, opt_str, value, parser): | |
if os.path.exists(value): | |
setattr(parser.values, option.dest, value) | |
else: | |
print("%s directory cannot be found" % option) | |
sys.exit() | |
def get_seq_name(fasta_in): | |
name = os.path.basename(fasta_in) | |
return name | |
def combine_seqs(dir_path): | |
handle = open("combined.seqs", "w") | |
for infile in glob.glob(os.path.join(dir_path, '*.fasta')): | |
names = get_seq_name(infile) | |
reduced = names.replace('.fasta','') | |
handle.write(">"+str(reduced)+"\n") | |
with open(infile) as my_file: | |
for record in SeqIO.parse(my_file, "fasta"): | |
handle.write(str(record.seq)+"\n") | |
handle.close() | |
def run_blast(seq_path, blast): | |
if blast == "blastn": | |
extension = "*.fasta" | |
else: | |
extension = "*.pep" | |
for infile in glob.glob(os.path.join(seq_path, '%s' % extension)): | |
names = get_seq_name(infile) | |
if "fasta" in extension: | |
reduced = names.replace('.fasta','') | |
else: | |
reduced = names.replace('.pep','') | |
if blast == "blastn": | |
os.system('blastn -task blastn -num_threads 8 -query %s -db combined.seqs -out %s.blast.out -dust no -num_alignments 20000 -outfmt "7 std sseq"' % (infile,reduced)) | |
elif blast == "tblastn": | |
print('tblastn -query %s -db combined.seqs -out %s.blast.out -seg no -num_alignments 20000 -outfmt "7 std sseq"' % (infile, reduced)) | |
os.system('tblastn -query %s -db combined.seqs -out %s.blast.out -seg no -num_alignments 20000 -outfmt "7 std sseq"' % (infile, reduced)) | |
else: | |
print("option not supported..choose from blastn or tblastn") | |
sys.exit() | |
subprocess.check_call("sort -u -k 2,2 %s.blast.out > %s.blast.parsed" % (reduced,reduced), shell=True) | |
parsed_blast_to_seqs("%s.blast.parsed" % reduced, "%s.extracted.seqs" % reduced) | |
def parsed_blast_to_seqs(parsed_file, outfile): | |
handle = open(outfile, "w") | |
with open(parsed_file) as infile: | |
for line in infile: | |
if line.startswith("#"): | |
pass | |
else: | |
fields = line.split() | |
handle.write(">"+str(fields[1])+"\n"+fields[12]+"\n") | |
handle.close() | |
def get_names(fasta_file): | |
names = [] | |
with open(fasta_file) as my_file: | |
for record in SeqIO.parse(my_file, "fasta"): | |
names.append(record.id) | |
return names | |
def split_files(): | |
import glob | |
import os | |
curr_dir=os.getcwd() | |
marker_names = [] | |
for infile in glob.glob(os.path.join(curr_dir, '*.extracted.seqs')): | |
names = get_seq_name(infile) | |
reduced = names.replace('.extracted.seqs','') | |
marker_names.append(reduced) | |
sorted(marker_names) | |
for marker in marker_names: | |
with open("%s.extracted.seqs" % marker) as my_file: | |
"""The markers should now be in order""" | |
for record in SeqIO.parse(my_file, "fasta"): | |
marker_name = record.id | |
handle = open("%s.seqs.fasta" % marker_name, "a") | |
handle.write(">"+str(record.seq)+"\n"+str(record.seq)+"\n") | |
handle.close() | |
def process_fastas(): | |
curr_dir=os.getcwd() | |
for infile in glob.glob(os.path.join(curr_dir, '*seqs.fasta')): | |
handle = open("%s.concat" % infile, "w") | |
names = get_seq_name(infile) | |
handle.write(">"+str(names)+"\n") | |
with open(infile) as my_file: | |
for record in SeqIO.parse(my_file, "fasta"): | |
seqs = [ ] | |
seqs.append(record.seq) | |
handle.write("".join([str(x) for x in seqs])+"\n") | |
handle.close() | |
def main(directory,seqs,blast,muscle): | |
dir_path=os.path.abspath("%s" % directory) | |
seq_path=os.path.abspath("%s" % seqs) | |
result = is_tool(blast) | |
if result is False: | |
print("%s must be in your path..exiting" % blast) | |
sys.exit() | |
#Will skip this step if the file has been created. Does not check to see if it is complete | |
if os.path.isfile("combined.seqs"): | |
pass | |
else: | |
combine_seqs(dir_path) | |
result = is_tool("makeblastdb") | |
if result is False: | |
print('makeblastdb must be in your path..exiting') | |
sys.exit() | |
else: | |
os.system("makeblastdb -in combined.seqs -dbtype nucl > /dev/null 2>&1") | |
run_blast(seq_path,blast) | |
os.system("rm *.blast.out *.blast.parsed") | |
split_files() | |
process_fastas() | |
if muscle == "T": | |
result = is_tool("muscle") | |
if result is False: | |
print("muscle v5 must be in your path..exiting") | |
sys.exit() | |
else: | |
try: | |
os.system("cat *.seqs.fasta.concat > tmp_concatenated") | |
os.system('sed "s/ //g" tmp_concatenated > all_concatenated') | |
os.system("muscle -super5 all_concatenated -output all_concatenated_aligned.fasta") | |
os.system("rm *seqs.fasta* tmp_concatenated all_concatenated") | |
except: | |
print("make sure the muscle version is >5") | |
sys.exit() | |
elif muscle == "F": | |
print("Muscle not run") | |
os.system("rm *seqs.fasta*") | |
else: | |
print("unknown option..muscle not run") | |
os.system("rm *seqs.fasta*") | |
if __name__ == "__main__": | |
usage="usage: %prog [options]" | |
parser = optparse.OptionParser(usage=usage) | |
parser.add_option("-d", "--directory", dest="directory", | |
help="path to genome files in fasta format [REQUIRED]", | |
type="string", action="callback", callback=test_dir) | |
parser.add_option("-s", "--seqs", dest="seqs", | |
help="path to sequence directory [REQUIRED]", | |
type="string", action="callback", callback=test_dir) | |
parser.add_option("-b", "--blast", dest="blast", | |
help="which blast method to use? blastn or tblastn? Defaults to blastn", | |
type="string", action="store", default="blastn") | |
parser.add_option("-m", "--muscle", dest="muscle", | |
help="run muscle on all concatenated? Defaults to F", | |
type="string", action="store", default="F") | |
options, args = parser.parse_args() | |
mandatories = ["directory", "seqs"] | |
for m in mandatories: | |
if not getattr(options, m, None): | |
print("\nMust provide %s.\n" %m) | |
parser.print_help() | |
exit(-1) | |
main(options.directory,options.seqs,options.blast,options.muscle) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment