Skip to content

Instantly share code, notes, and snippets.

@ctb
Created November 2, 2020 03:05
Show Gist options
  • Save ctb/763fa60e4357461d1417ccb915cdcef4 to your computer and use it in GitHub Desktop.
Save ctb/763fa60e4357461d1417ccb915cdcef4 to your computer and use it in GitHub Desktop.
a simple prefetch script that searches large sourmash databases for all possible matches, and then saves them
#! /usr/bin/env python
import sys
import argparse
import copy
import sourmash
from sourmash import sourmash_args
from sourmash.logging import notify, error
def main():
p = argparse.ArgumentParser()
p.add_argument('--db', nargs='+', action='append',
help='one or more LCA databases to use')
p.add_argument('--query', nargs='*', default=[], action='append',
help='one or more signature files to use as queries')
p.add_argument('--save-matches', required=True)
p.add_argument('--output-unassigned')
p.add_argument('--threshold-bp', type=float, default=1e5)
args = p.parse_args()
# flatten --db and --query lists
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]
# build one big query:
query_sigs = []
for query_file in args.query:
query_sigs.extend(sourmash_args.load_file_as_signatures(query_file,
ksize=31))
mh = query_sigs[0].minhash
for query_sig in query_sigs[1:]:
mh += query_sig.minhash
unident_mh = copy.copy(mh)
notify(f'Loaded {len(mh.hashes)} hashes from {len(query_sigs)} query signatures.')
# iterate over signatures in one at a time
keep = []
n = 0
for db in args.db:
for sig in sourmash_args.load_file_as_signatures(db, ksize=31):
n += 1
common = mh.count_common(sig.minhash)
if common:
# check scaled...
if common * mh.scaled >= args.threshold_bp:
keep.append(sig)
unident_mh.remove_many(sig.minhash.hashes)
if n % 10 == 0:
notify(f'{n} searched, {len(keep)} matches.', end='\r')
notify(f'{n} searched, {len(keep)} matches.')
if keep and args.save_matches:
notify('saving all matches to "{}"', args.save_matches)
with sourmash_args.FileOutput(args.save_matches, 'wt') as fp:
sourmash.save_signatures(keep, fp)
if args.output_unassigned:
notify('saving unidentified hashes to "{}"', args.output_unassigned)
ss = sourmash.SourmashSignature(unident_mh)
with open(args.output_unassigned, 'wt') as fp:
sourmash.save_signatures([ ss ], fp)
return 0
if __name__ == '__main__':
sys.exit(main())
@nmb85
Copy link

nmb85 commented Nov 2, 2020

Thanks for this! I have been meaning to get involved on this open-source project again, but have been pulled away by exciting but demanding developments at work. This is really inspiring me to try to make some time to return to learning github and using this little gist to solve a problem of my own. :)

@ctb
Copy link
Author

ctb commented Nov 2, 2020

👍

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment