Skip to content

Instantly share code, notes, and snippets.

@giuscri
Last active July 30, 2017 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save giuscri/d4c25c5af18a15d1116f1d27cd5918ca to your computer and use it in GitHub Desktop.
Save giuscri/d4c25c5af18a15d1116f1d27cd5918ca to your computer and use it in GitHub Desktop.
It should implement the Chen-Seiferas suffix tree presented here, http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.632.4&rep=rep1&type=pdf
from copy import copy
from itertools import takewhile
from pdb import set_trace as brk
def is_prefix(b, a):
"""Tests if b is a prefix of a."""
return a[:len(b)] == b
class Node:
def __init__(self, pathstring, parent=None, children={},
shortcut={}, shortcut_for=[], identifier=None):
self.pathstring = pathstring
self.parent = parent
self.children = copy(children)
self.shortcut = copy(shortcut)
self.shortcut_for = copy(shortcut_for)
self.identifier = identifier
@property
def delta(self):
"""Returns the delta between the current pathstring
and the one of its parent."""
if not self.parent: return ''
return self.pathstring[len(self.parent.pathstring):]
def __repr__(self):
return f'"{self.pathstring}"'
class Tree:
def __init__(self, text):
self.root = Node(pathstring='')
n = Node(pathstring='$', parent=self.root, identifier=-1)
self.root.children['$'] = n
n.parent = self.root
self.current_text = '$'
self.longest_leaf = n
for c in reversed(text): self.read(c)
def leaves_id(self, root=None, result=None):
"""Explores tree from root, returning ids of the leaves."""
if not root: root = self.root
if result is None: result = []
if not root.children:
result.append(root.identifier)
return result
for _, c in root.children.items():
self.leaves_id(root=c, result=result)
return result
def shortcuts(self, root=None, result=None):
"""Returns all the shortcuts saved in the tree"""
if not result: result = []
if not root: root = self.root
result.extend(map(lambda p, n=root: (root, p[1], p[0]),
root.shortcut.items()))
for _, c in root.children.items(): self.shortcuts(root=c, result=result)
return result
def read(self, character):
"""Update tree such to encode suffixes of character+current_text"""
v = self.longest_leaf
leaf_pathstring = character+self.current_text
# Start from longest_leaf up to root, until you find a "character-shortcut"
while not v is self.root and not v.shortcut.get(character):
v = v.parent
# If you reached the root, and there's no character-shortcut, parent is the root
if v is self.root and not v.shortcut.get(character):
y = v
else:
# Else, follow the shortcut
s = v.shortcut[character]
# If s is a prefix of the leaf_pathstring, y=s
if is_prefix(s.pathstring, leaf_pathstring):
y = s
else:
# Else, build y and put it between the parent of s and s
parent = s.parent
del parent.children[s.delta[0]]
l = len(tuple(takewhile(lambda p: p[0]==p[1],
zip(s.pathstring, leaf_pathstring))))
y_pathstring = s.pathstring[:l]
y = Node(pathstring=y_pathstring)
y.parent = parent
parent.children[y.delta[0]] = y
s.parent = y
y.children[s.delta[0]] = s
# y inherits the shortcuts of s
y.shortcut = copy(s.shortcut)
for n in y.shortcut.values(): n.shortcut_for.append(y)
# y may be a better shortcut for nodes that points to s
for n in s.shortcut_for:
if is_prefix(character+n.pathstring, y.pathstring):
n.shortcut[character] = y
# Set the identifier of the list, using negative python notation
identifier = self.longest_leaf.identifier-1
leaf = Node(pathstring=leaf_pathstring, identifier=identifier)
# Append the leaf to y
leaf.parent = y
y.children[leaf.delta[0]] = leaf
# All the nodes from longest_leaf up to the first character-shortcut
# must shortcut to the new leaf.
n = self.longest_leaf
while not n.shortcut.get(character):
n.shortcut[character] = leaf
leaf.shortcut_for.append(n)
if n is self.root: break
n = n.parent
self.current_text = leaf.pathstring
self.longest_leaf = leaf
return self
def string_matching(P, T, tree=None):
"""Find valid shifts of T to find P.
It uses a passed suffix tree, or it builds
a new one using T. It's better to pass
a previously built tree, such to leverage
the fact that you can build one suffix
tree per text and reuse that multiple
times for different patterns, matching
the pattern in the text with a complexity
linear in the size of the current pattern
- no longer dependent on the size of the text.
It consumes the pattern, moving through
the tree using characters of the former.
If there are no characters left in P,
return all the id's of the leaves in
the subtree rooted at the current node
reached.
If you must stop before consuming
the whole pattern, P does not occur
in T. Return an empty list then.
"""
if not tree:
tree = Tree()
for c in reversed(T): tree.read(c)
i, n = 0, tree.root.children.get(P[0])
if not n: return []
while i < len(P):
if i < len(n.pathstring) and P[i] != n.pathstring[i]:
return []
elif i >= len(n.pathstring):
n = n.children.get(P[i])
if not n: return []
i += 1
return sorted(map(lambda x: len(T)+1+x, tree.leaves_id(root=n)))
if __name__ == '__main__':
text = 'bbabab'
tree = Tree(text)
print(string_matching('b', text, tree=tree))
print(string_matching('bb', text, tree=tree))
print(string_matching('a', text, tree=tree))
print(string_matching('ab', text, tree=tree))
print(string_matching('abab', text, tree=tree))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment