sourmash compute --scaled 10000 -f data/GCF*.fna.gz
sourmash index -k 31 test.sbt GCF*.sig
./build-and-search.py test.sbt data/GCF_000005845.2_ASM584v2_genomic.fna.gz
Last active
September 21, 2017 12:22
-
-
Save ctb/40da5a63a173dfdf4b109ffde7b76874 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
from __future__ import print_function | |
import sys | |
import screed | |
import sourmash_lib | |
KSIZE=31 | |
SBT_filename = sys.argv[1] | |
# get a DNA sequence somehow | |
query_seq = next(iter(screed.open(sys.argv[2]))).sequence | |
print('got {} DNA characters to query'.format(len(query_seq))) | |
# load an SBT; see sourmash_args.py for add'l load stuff, like checking ksize | |
tree = sourmash_lib.load_sbt_index(SBT_filename) | |
# build/load a signature | |
minhash = sourmash_lib.MinHash(ksize=KSIZE, n=0, scaled=10000) | |
minhash.add_sequence(query_seq) | |
query_sig = sourmash_lib.SourmashSignature('', minhash, | |
name='my favorite query') | |
# do the search -- | |
threshold = 0.1 | |
for found_sig, similarity in sourmash_lib.search_sbt_index(tree, query_sig, | |
threshold): | |
print((query_sig.name(), found_sig.name(), similarity,)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment