Skip to content

Instantly share code, notes, and snippets.

@bogemad
Created June 29, 2021 23:36
Show Gist options
  • Save bogemad/84488444f6d776fbd08defb64db445de to your computer and use it in GitHub Desktop.
Save bogemad/84488444f6d776fbd08defb64db445de to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys, os, re
from Bio import SeqIO
base_dir = sys.argv[1]
keyword = sys.argv[2]
outdir = sys.argv[3]
min_len = int(float(sys.argv[4])*1000000)
if not os.path.isdir(outdir):
os.mkdir(outdir)
for sample_name in os.listdir(os.path.join(base_dir, 'raw_assemblies')):
blob_table_file = os.path.join(base_dir, 'qc_plots', 'blobtools', sample_name, '{0}.{0}.blobDB.table.txt'.format(sample_name))
fasta_file = os.path.join(base_dir, 'raw_assemblies', sample_name, 'scaffolds.fasta')
if not os.path.isfile(fasta_file):
print("No assembly found for {}".format(sample_name))
continue
recs = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
new_recs = []
with open(blob_table_file) as blob_table_handle:
for line in blob_table_handle:
if line.startswith('#'):
continue
data = line.strip().split('\t')
if re.search(r'{}'.format(keyword), data[8]):
new_recs.append(recs[data[0]])
if len(new_recs) == 0:
print("{1}: No sequences matching {0} found".format(keyword, sample_name))
elif sum(len(rec) for rec in new_recs) < int(min_len):
print("{}: Sequences matching {} sum to length less than minimum length: {} < {} bp".format(sample_name, keyword, sum(len(rec) for rec in new_recs), min_len))
else:
print("{} ok. Saving to {}".format(sample_name, os.path.join(outdir, '{}.purified_sequence.fasta'.format(sample_name))))
SeqIO.write(new_recs, os.path.join(outdir, '{}.purified_sequence.fasta'.format(sample_name)), 'fasta')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment