Skip to content

Instantly share code, notes, and snippets.

@HandcartCactus
Created July 23, 2022 07:27
Show Gist options
  • Save HandcartCactus/21963f83bb7a325915f7be84ca7c64a9 to your computer and use it in GitHub Desktop.
Save HandcartCactus/21963f83bb7a325915f7be84ca7c64a9 to your computer and use it in GitHub Desktop.
Efficiently search a list of words for hamming distances of 1 using a Trie structure
import itertools as it
import random
# https://stackoverflow.com/questions/63584103/passing-a-list-of-strings-to-be-put-into-trie
class TrieNode:
def __init__(self):
self.end = False
self.children = {}
def all_words(self, prefix):
if self.end:
yield prefix
for letter, child in self.children.items():
yield from child.all_words(prefix + letter)
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word):
curr = self.root
for letter in word:
node = curr.children.get(letter)
if not node:
node = TrieNode()
curr.children[letter] = node
curr = node
curr.end = True
def contains(self, seq):
does_contain = True
curr = self.root
for letter in seq:
if letter in curr.children:
node = curr.children.get(letter)
curr = node
else:
does_contain = False
break
return does_contain
def insert_many(self, words):
for word in words:
self.insert(word) # Just loop over self.insert
def find_hamming_distance_1(self, seq):
hd1_strings = []
curr = self.root
# for each character in the lookup word
for idx, letter in enumerate(seq):
# the lookup word before and after this character
prefix = seq[:idx]
suffix_aft_branch = seq[idx + 1 :]
# different characters available at this index
for branch in curr.children:
if branch != letter:
# since hamming distance is 1, rest of word must match after 1 char difference
hd1_str = prefix + branch + suffix_aft_branch
# we can just look into the tree and see if it's there
if self.contains(hd1_str):
hd1_strings.append(hd1_str)
# advance the node in the trie for the next character
node = curr.children.get(letter)
if node:
curr = node
else:
break
return hd1_strings
if __name__ == '__main__':
t = Trie()
wlen = 11
for chars in it.product("ATCG", repeat=wlen):
if random.random() < 1e-2:
word = "".join(chars)
t.insert(word)
print(t.find_hamming_distance_1("A"*wlen))
@HandcartCactus
Copy link
Author

Sample output:

['TAAAAAAA', 'CAAAAAAA', 'GAAAAAAA', 'ATAAAAAA', 'ACAAAAAA', 'AGAAAAAA', 'AATAAAAA', 'AACAAAAA', 'AAGAAAAA', 'AAATAAAA', 'AAACAAAA', 'AAAGAAAA', 'AAAATAAA', 'AAAACAAA', 'AAAAGAAA', 'AAAAATAA', 'AAAAACAA', 'AAAAAGAA', 'AAAAAATA', 'AAAAAACA', 'AAAAAAGA', 'AAAAAAAT', 'AAAAAAAC', 'AAAAAAAG']

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment