Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active September 2, 2017 17:34
Show Gist options
  • Save ctb/77c28232076ad2c8c1726919be2b95dc to your computer and use it in GitHub Desktop.

reverse indexing foo for sourmash signatures

#! /usr/bin/env python
import argparse
import collections
import sys
sys.path.insert(0, 'kraken/')
sys.path.insert(0, 'revindex/')
import revindex_utils
import lca_json
SCALED=100000
def main():
    """Report taxonomic lineages for hash values seen in >= 10 samples.

    Loads a hashval reverse index plus one or more LCA JSON databases,
    counts how many signatures each hash value appears in, and for each
    common hash value (present in >= 10 samples) prints the lowest
    common ancestor lineage found across the LCA databases.  Hash values
    with no LCA assignment are tallied and summarized at the end.
    """
    p = argparse.ArgumentParser()
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('revindex')
    p.add_argument('lca_json', nargs='+')
    args = p.parse_args()

    idx = revindex_utils.HashvalRevindex(args.revindex)

    lca_db_list = []
    for lca_filename in args.lca_json:
        lca_db = lca_json.LCA_Database(lca_filename)
        taxfoo, hashval_to_lca, _ = lca_db.get_database(args.ksize, SCALED)
        lca_db_list.append((taxfoo, hashval_to_lca))
    # NOTE(review): 'taxfoo' below is the one from the *last* database
    # loaded -- this assumes all databases share a single taxonomy;
    # confirm against the lca_json format.

    # count how many signatures each hash value occurs in.
    cnt = collections.Counter()
    for hashval, sigids in idx.hashval_to_sigids.items():
        cnt[hashval] += len(sigids)

    total = 0
    found = 0
    unknown = collections.defaultdict(int)

    for hashval, c in cnt.most_common():
        if c < 10:          # break when we hit things in < 10 samples.
            break
        total += 1

        # collect LCA assignments for this hash across all databases.
        lca_set = set()
        for (_, hashval_to_lca) in lca_db_list:
            this_lca = hashval_to_lca.get(hashval)
            if this_lca is not None:
                lca_set.add(this_lca)

        if not lca_set:
            # no database knows this hash; record by commonality.
            unknown[c] += 1
            continue

        # (removed a redundant 'assert lca_set' -- the guard above
        # already ensures lca_set is non-empty here.)
        lca = taxfoo.find_lca(lca_set)
        assert lca, lca
        print('hash {}, in {} samples; lineage: {}'.format(
            hashval, c, ";".join(taxfoo.get_lineage(lca))))
        found += 1

    print('found', found, 'of', total)
    print('distribution of unknowns:')
    print('commonality n')
    sofar = 0
    for commonality, n_unknown in sorted(unknown.items()):
        sofar += n_unknown
        print('{} {} {}'.format(commonality, n_unknown, sofar))


if __name__ == '__main__':
    main()
#! /usr/bin/env python
"""
"""
import sys, os
import sourmash_lib, sourmash_lib.signature
import argparse
from pickle import dump, load
from collections import defaultdict
def traverse_find_sigs(dirnames):
    """Recursively walk *dirnames*, yielding paths to .sig / .sbt files.

    Parameters:
        dirnames -- iterable of directory paths to search.

    Yields:
        Full path (str) of every file whose name ends in '.sig' or
        '.sbt', in os.walk order.
    """
    for dirname in dirnames:
        for root, _dirs, files in os.walk(dirname):
            for name in files:
                # str.endswith accepts a tuple: one call, not two.
                if name.endswith(('.sig', '.sbt')):
                    yield os.path.join(root, name)
def main():
    """Build a hashval -> signature reverse index and pickle it to disk.

    Writes three pickle files -- '<savename>.hashvals',
    '<savename>.siginfo' and '<savename>.abunds' -- mapping each minhash
    value to the signatures (and abundances) that contain it.
    """
    p = argparse.ArgumentParser()
    p.add_argument('savename')
    p.add_argument('sigs', nargs='+')
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('--scaled', default=10000, type=int)
    p.add_argument('--traverse-directory', action='store_true')
    args = p.parse_args()

    # hashval -> list of signature IDs containing that hash
    hashval_to_sigids = defaultdict(list)
    # hashval -> list of abundances (parallel to hashval_to_sigids)
    hashval_to_abunds = defaultdict(list)
    # signature ID -> (filename, md5) so IDs can be resolved later
    sigid_to_siginfo = {}
    signum = 1

    # for every minhash in every signature, link it to its NCBI taxonomic ID.
    if args.traverse_directory:
        inp_files = list(traverse_find_sigs(args.sigs))
    else:
        inp_files = list(args.sigs)

    print('loading signatures & traversing hashes')
    bad_input = 0
    for n, filename in enumerate(inp_files):
        if n % 100 == 0:
            print('... loading file #', n, 'of', len(inp_files), end='\r')

        try:
            sig = sourmash_lib.signature.load_one_signature(
                filename, select_ksize=args.ksize)
        except (FileNotFoundError, ValueError):
            # when traversing a directory tree, skip unloadable files;
            # with an explicit file list, a bad file is a hard error.
            if not args.traverse_directory:
                raise
            bad_input += 1
            continue

        # record which file this signature came from.
        this_signum = signum
        sigid_to_siginfo[this_signum] = (filename, sig.md5sum())
        signum += 1

        # downsample so every signature is indexed at the same resolution.
        if sig.minhash.scaled < args.scaled:
            sig.minhash = sig.minhash.downsample_scaled(args.scaled)

        # add hashval -> signature info
        for hashval, abund in sig.minhash.get_mins(with_abundance=True).items():
            hashval_to_sigids[hashval].append(this_signum)
            hashval_to_abunds[hashval].append(abund)

    print('\n...done. {} hashvals, {} signatures'.format(
        len(hashval_to_sigids), len(sigid_to_siginfo)))
    if bad_input:
        print('failed to load {} of {} files found'.format(bad_input,
                                                           len(inp_files)))

    # persist the three index components side by side.
    with open(args.savename + '.hashvals', 'wb') as hashval_fp:
        dump(hashval_to_sigids, hashval_fp)
    with open(args.savename + '.siginfo', 'wb') as sig_fp:
        dump(sigid_to_siginfo, sig_fp)
    with open(args.savename + '.abunds', 'wb') as abund_fp:
        dump(hashval_to_abunds, abund_fp)


if __name__ == '__main__':
    main()
#! /usr/bin/env python
import argparse
from revindex_utils import HashvalRevindex
import collections
def main():
    """Print the 100 most common hash values in a saved reverse index."""
    p = argparse.ArgumentParser()
    p.add_argument('loadname')
    args = p.parse_args()

    idx = HashvalRevindex(args.loadname)

    # tally how many signatures contain each hash value.
    counts = collections.Counter()
    for hashval, sigids in idx.hashval_to_sigids.items():
        counts[hashval] += len(sigids)

    for hashval, n in counts.most_common(100):
        print(hashval, n)


if __name__ == '__main__':
    main()
#! /usr/bin/env python
from pickle import load
class HashvalRevindex(object):
    """Reverse index mapping hash values back to containing signatures.

    Loads the three pickle files written alongside *savename* and exposes
    them as attributes:

    # hashval_to_sigids  -- hashval -> list of signature IDs
    # sigid_to_siginfo   -- signature ID -> (filename, md5)
    # hashval_to_abunds  -- hashval -> list of abundances
    """

    def __init__(self, savename):
        (self.hashval_to_sigids,
         self.sigid_to_siginfo,
         self.hashval_to_abunds) = load_revindex(savename)
def load_revindex(savename):
    """Load the pickled reverse-index components saved under *savename*.

    Reads '<savename>.hashvals', '<savename>.siginfo' and
    '<savename>.abunds' and returns the tuple
    (hashval_to_sigids, sigid_to_siginfo, hashval_to_abunds).

    NOTE(review): pickle.load is unsafe on untrusted files -- only load
    indexes you created yourself.
    """
    components = []
    # the three components live in separate pickles sharing one prefix.
    for suffix in ('.hashvals', '.siginfo', '.abunds'):
        with open(savename + suffix, 'rb') as fp:
            components.append(load(fp))
    return tuple(components)
if __name__ == '__main__':
    # quick smoke test: load the index named on the command line.
    import sys

    print('loading', sys.argv[1])
    h = HashvalRevindex(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment