Skip to content

Instantly share code, notes, and snippets.

@kldtz
Last active March 17, 2021 07:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kldtz/ab8f26ca7ded84bf84773fe40b9ae828 to your computer and use it in GitHub Desktop.
Save kldtz/ab8f26ca7ded84bf84773fe40b9ae828 to your computer and use it in GitHub Desktop.
Educational implementation of Ukkonen's algorithm
from os import mkdir
import sys
class AuxState:
def __init__(self, root):
self.id = '⊥'
self.root = root
self.suffix_link = None
self.transitions = {'*': self.root}
def transition(self, char):
return self.root
def has_transition(self, char):
return True
class State:
def __init__(self, first, last, id):
self.id = id
# index of first and last char of transition leading up to this state
self.start = first
self.end = last
self.suffix_link = None
self.transitions = {}
def transition(self, char):
return self.transitions[char]
def has_transition(self, char):
return char in self.transitions
class SuffixTree:
def __init__(self, text):
self.state_id = 1
# 1-based indices
self.text = '*' + text
self.root = State(0, 0, 'R')
self.root.suffix_link = AuxState(self.root)
s = self.root
k = 1
for i in range(1, len(self.text)):
s, k = self.update(s, k, i)
s, k = self.canonize(s, k, i)
def update(self, s, k, i):
oldr = self.root
end_point, r = self.test_and_split(s, k, i - 1, self.text[i])
while not end_point:
r.transitions[self.text[i]] = State(i, len(self.text) - 1, self.state_id)
self.state_id += 1
if oldr != self.root:
oldr.suffix_link = r
oldr = r
s, k = self.canonize(s.suffix_link, k, i - 1)
end_point, r = self.test_and_split(s, k, i - 1, self.text[i])
if oldr != self.root:
oldr.suffix_link = s
return s, k
def test_and_split(self, s, k, p, t):
if k <= p:
ss = s.transition(self.text[k])
if t == self.text[ss.start + p - k + 1]:
return True, s
else:
r = State(ss.start, ss.start + p - k, self.state_id)
self.state_id += 1
s.transitions[self.text[ss.start]] = r
ss.start = ss.start + p - k + 1
r.transitions[self.text[ss.start]] = ss
return False, r
else:
if not s.has_transition(t):
return False, s
else:
return True, s
def canonize(self, s, k, p):
if p < k:
return s, k
else:
ss = s.transition(self.text[k])
while ss.end - ss.start <= p - k:
k += ss.end - ss.start + 1
s = ss
if k <= p:
ss = s.transition(self.text[k])
return s, k
def dot(self):
lines = [
'digraph suffixtree {',
'graph [bgcolor=transparent];',
'node [shape=circle, fixedsize=true, width=0.5];',
'rankdir=LR;'
]
self._dot(self.root.suffix_link, lines)
lines.append('}')
return '\n'.join(lines)
def _dot(self, state, lines):
if state.suffix_link:
lines.append(f'\t{state.id} -> {state.suffix_link.id} [color=steelblue];')
for child in state.transitions.values():
lines.append(
f'\t{state.id} -> {child.id} [label="{child.start},{child.end}: {self.text[child.start:child.end + 1]}"];')
self._dot(child, lines)
if __name__ == "__main__":
text = sys.argv[1]
dir = text[:-1]
mkdir(dir)
for i in range(1, len(text) + 1):
st = SuffixTree(text[:i])
with open(f'{dir}/{dir}-{i}.dot', 'w') as fout:
fout.write(st.dot())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment