Skip to content

Instantly share code, notes, and snippets.

@rchikhi
Last active December 26, 2022 15:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rchikhi/ef2a6371c1a6dc2ab53c572ff6890cea to your computer and use it in GitHub Desktop.
Save rchikhi/ef2a6371c1a6dc2ab53c572ff6890cea to your computer and use it in GitHub Desktop.
NTHash rolling nucleotide (kmer) hashing. NThash1 version, pure Python implementation of https://academic.oup.com/bioinformatics/article/32/22/3492/2525588
# nthash1, pure python implementation
#
# (not nthash2, so only for k values < 64)
# ported from https://github.com/luizirber/nthash/
h = {'A': 0x3c8b_fbb3_95c6_0474,
'C': 0x3193_c185_62a0_2b4c,
'G': 0x2032_3ed0_8257_2324,
'T': 0x2955_49f5_4be2_4456,
'N': 0}
rc = {'A': 0x2955_49f5_4be2_4456,
'C': 0x2032_3ed0_8257_2324,
'G': 0x3193_c185_62a0_2b4c,
'T': 0x3c8b_fbb3_95c6_0474,
'N': 0}
#https://gist.github.com/williballenthin/05aadb969e43788593d3
def ROR(x, n, bits = 64):
mask = (2**n) - 1
mask_bits = x & mask
return (x >> (n % bits)) | (mask_bits << ((bits - n) % bits))
def ROL(x, n, bits = 64):
return ROR(x, (bits - n) % bits, bits)
class NTHash:
def __init__(self, seq, k):
assert(k <= len(seq))
self.seq = seq
self.k = k
self.current_idx = 0
self.max_idx = len(seq) - k + 1
self.fh = 0
for (i,v) in enumerate(seq[:k]):
self.fh ^= ROL(h[v], k - i - 1)
self.rh = 0
for (i,v) in enumerate(seq[:k][::-1]):
self.rh ^= ROL(rc[v], k - i - 1)
def __iter__(self):
return self
def __next__(self):
if self.current_idx == self.max_idx:
raise StopIteration
if self.current_idx != 0:
i = self.current_idx - 1
seqi = self.seq[i]
seqk = self.seq[i + self.k]
self.fh = ROL(self.fh,1) ^ ROL(h[seqi],self.k) ^ h[seqk]
self.rh = ROR(self.rh,1) ^ \
ROR(rc[seqi],1) ^ \
ROL(rc[seqk],self.k - 1)
self.current_idx += 1
return min(self.rh, self.fh)
if __name__ == '__main__':
# https://github.com/luizirber/nthash/blob/latest/tests/nthash.rs#L14
seq = "ACGTCGTCAGTCGATGCAGT"
print("hashing",seq,":",[hex(h) for h in NTHash(seq,5)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment