Last active
July 30, 2017 21:19
-
-
Save giuscri/d4c25c5af18a15d1116f1d27cd5918ca to your computer and use it in GitHub Desktop.
It should implement the Chen-Seiferas suffix tree presented here, http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.632.4&rep=rep1&type=pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import copy | |
from itertools import takewhile | |
from pdb import set_trace as brk | |
def is_prefix(b, a): | |
"""Tests if b is a prefix of a.""" | |
return a[:len(b)] == b | |
class Node: | |
def __init__(self, pathstring, parent=None, children={}, | |
shortcut={}, shortcut_for=[], identifier=None): | |
self.pathstring = pathstring | |
self.parent = parent | |
self.children = copy(children) | |
self.shortcut = copy(shortcut) | |
self.shortcut_for = copy(shortcut_for) | |
self.identifier = identifier | |
@property | |
def delta(self): | |
"""Returns the delta between the current pathstring | |
and the one of its parent.""" | |
if not self.parent: return '' | |
return self.pathstring[len(self.parent.pathstring):] | |
def __repr__(self): | |
return f'"{self.pathstring}"' | |
class Tree: | |
def __init__(self, text): | |
self.root = Node(pathstring='') | |
n = Node(pathstring='$', parent=self.root, identifier=-1) | |
self.root.children['$'] = n | |
n.parent = self.root | |
self.current_text = '$' | |
self.longest_leaf = n | |
for c in reversed(text): self.read(c) | |
def leaves_id(self, root=None, result=None): | |
"""Explores tree from root, returning ids of the leaves.""" | |
if not root: root = self.root | |
if result is None: result = [] | |
if not root.children: | |
result.append(root.identifier) | |
return result | |
for _, c in root.children.items(): | |
self.leaves_id(root=c, result=result) | |
return result | |
def shortcuts(self, root=None, result=None): | |
"""Returns all the shortcuts saved in the tree""" | |
if not result: result = [] | |
if not root: root = self.root | |
result.extend(map(lambda p, n=root: (root, p[1], p[0]), | |
root.shortcut.items())) | |
for _, c in root.children.items(): self.shortcuts(root=c, result=result) | |
return result | |
def read(self, character): | |
"""Update tree such to encode suffixes of character+current_text""" | |
v = self.longest_leaf | |
leaf_pathstring = character+self.current_text | |
# Start from longest_leaf up to root, until you find a "character-shortcut" | |
while not v is self.root and not v.shortcut.get(character): | |
v = v.parent | |
# If you reached the root, and there's no character-shortcut, parent is the root | |
if v is self.root and not v.shortcut.get(character): | |
y = v | |
else: | |
# Else, follow the shortcut | |
s = v.shortcut[character] | |
# If s is a prefix of the leaf_pathstring, y=s | |
if is_prefix(s.pathstring, leaf_pathstring): | |
y = s | |
else: | |
# Else, build y and put it between the parent of s and s | |
parent = s.parent | |
del parent.children[s.delta[0]] | |
l = len(tuple(takewhile(lambda p: p[0]==p[1], | |
zip(s.pathstring, leaf_pathstring)))) | |
y_pathstring = s.pathstring[:l] | |
y = Node(pathstring=y_pathstring) | |
y.parent = parent | |
parent.children[y.delta[0]] = y | |
s.parent = y | |
y.children[s.delta[0]] = s | |
# y inherits the shortcuts of s | |
y.shortcut = copy(s.shortcut) | |
for n in y.shortcut.values(): n.shortcut_for.append(y) | |
# y may be a better shortcut for nodes that points to s | |
for n in s.shortcut_for: | |
if is_prefix(character+n.pathstring, y.pathstring): | |
n.shortcut[character] = y | |
# Set the identifier of the list, using negative python notation | |
identifier = self.longest_leaf.identifier-1 | |
leaf = Node(pathstring=leaf_pathstring, identifier=identifier) | |
# Append the leaf to y | |
leaf.parent = y | |
y.children[leaf.delta[0]] = leaf | |
# All the nodes from longest_leaf up to the first character-shortcut | |
# must shortcut to the new leaf. | |
n = self.longest_leaf | |
while not n.shortcut.get(character): | |
n.shortcut[character] = leaf | |
leaf.shortcut_for.append(n) | |
if n is self.root: break | |
n = n.parent | |
self.current_text = leaf.pathstring | |
self.longest_leaf = leaf | |
return self | |
def string_matching(P, T, tree=None): | |
"""Find valid shifts of T to find P. | |
It uses a passed suffix tree, or it builds | |
a new one using T. It's better to pass | |
a previously built tree, such to leverage | |
the fact that you can build one suffix | |
tree per text and reuse that multiple | |
times for different patterns, matching | |
the pattern in the text with a complexity | |
linear in the size of the current pattern | |
- no longer dependent on the size of the text. | |
It consumes the pattern, moving through | |
the tree using characters of the former. | |
If there are no characters left in P, | |
return all the id's of the leaves in | |
the subtree rooted at the current node | |
reached. | |
If you must stop before consuming | |
the whole pattern, P does not occur | |
in T. Return an empty list then. | |
""" | |
if not tree: | |
tree = Tree() | |
for c in reversed(T): tree.read(c) | |
i, n = 0, tree.root.children.get(P[0]) | |
if not n: return [] | |
while i < len(P): | |
if i < len(n.pathstring) and P[i] != n.pathstring[i]: | |
return [] | |
elif i >= len(n.pathstring): | |
n = n.children.get(P[i]) | |
if not n: return [] | |
i += 1 | |
return sorted(map(lambda x: len(T)+1+x, tree.leaves_id(root=n))) | |
if __name__ == '__main__': | |
text = 'bbabab' | |
tree = Tree(text) | |
print(string_matching('b', text, tree=tree)) | |
print(string_matching('bb', text, tree=tree)) | |
print(string_matching('a', text, tree=tree)) | |
print(string_matching('ab', text, tree=tree)) | |
print(string_matching('abab', text, tree=tree)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment