giuscri/seiferas.py

## seiferas.py
from copy import copy
from itertools import takewhile
from pdb import set_trace as brk

def is_prefix(b, a):
    """Tests if b is a prefix of a."""
    return a[:len(b)] == b

class Node:
    def __init__(self, pathstring, parent=None, children={},
                 shortcut={}, shortcut_for=[], identifier=None):
        self.pathstring = pathstring
        self.parent = parent
        self.children = copy(children)
        self.shortcut = copy(shortcut)
        self.shortcut_for = copy(shortcut_for)
        self.identifier = identifier

    @property
    def delta(self):
        """Returns the delta between the current pathstring
        and the one of its parent."""
        if not self.parent: return ''
        return self.pathstring[len(self.parent.pathstring):]

    def __repr__(self):
        return f'"{self.pathstring}"'

class Tree:
    def __init__(self, text):
        self.root = Node(pathstring='')

        n = Node(pathstring='$', parent=self.root, identifier=-1)
        self.root.children['$'] = n
        n.parent = self.root

        self.current_text = '$'
        self.longest_leaf = n

        for c in reversed(text): self.read(c)

    def leaves_id(self, root=None, result=None):
        """Explores tree from root, returning ids of the leaves."""
        if not root: root = self.root
        if result is None: result = []

        if not root.children:
            result.append(root.identifier)
            return result

        for _, c in root.children.items():
            self.leaves_id(root=c, result=result)

        return result

    def shortcuts(self, root=None, result=None):
        """Returns all the shortcuts saved in the tree"""
        if not result: result = []
        if not root: root = self.root

        result.extend(map(lambda p, n=root: (root, p[1], p[0]),
                          root.shortcut.items()))

        for _, c in root.children.items(): self.shortcuts(root=c, result=result)

        return result

    def read(self, character):
        """Update tree such to encode suffixes of character+current_text"""
        v = self.longest_leaf
        leaf_pathstring = character+self.current_text

        # Start from longest_leaf up to root, until you find a "character-shortcut"
        while not v is self.root and not v.shortcut.get(character):
            v = v.parent

        # If you reached the root, and there's no character-shortcut, parent is the root
        if v is self.root and not v.shortcut.get(character):
            y = v

        else:
            # Else, follow the shortcut
            s = v.shortcut[character]

            # If s is a prefix of the leaf_pathstring, y=s
            if is_prefix(s.pathstring, leaf_pathstring):
                y = s

            else:
                # Else, build y and put it between the parent of s and s
                parent = s.parent
                del parent.children[s.delta[0]]

                l = len(tuple(takewhile(lambda p: p[0]==p[1],
                              zip(s.pathstring, leaf_pathstring))))
                y_pathstring = s.pathstring[:l]
                y = Node(pathstring=y_pathstring)

                y.parent = parent
                parent.children[y.delta[0]] = y

                s.parent = y
                y.children[s.delta[0]] = s

                # y inherits the shortcuts of s
                y.shortcut = copy(s.shortcut)
                for n in y.shortcut.values(): n.shortcut_for.append(y)

                # y may be a better shortcut for nodes that points to s
                for n in s.shortcut_for:
                    if is_prefix(character+n.pathstring, y.pathstring):
                        n.shortcut[character] = y

        # Set the identifier of the list, using negative python notation
        identifier = self.longest_leaf.identifier-1
        leaf = Node(pathstring=leaf_pathstring, identifier=identifier)

        # Append the leaf to y
        leaf.parent = y
        y.children[leaf.delta[0]] = leaf

        # All the nodes from longest_leaf up to the first character-shortcut
        # must shortcut to the new leaf.
        n = self.longest_leaf
        while not n.shortcut.get(character):
            n.shortcut[character] = leaf
            leaf.shortcut_for.append(n)
            if n is self.root: break
            n = n.parent

        self.current_text = leaf.pathstring
        self.longest_leaf = leaf
        return self

def string_matching(P, T, tree=None):
    """Find valid shifts of T to find P.

    It uses a passed suffix tree, or it builds
    a new one using T. It's better to pass
    a previously built tree, such to leverage
    the fact that you can build one suffix
    tree per text and reuse that multiple
    times for different patterns, matching
    the pattern in the text with a complexity
    linear in the size of the current pattern
    - no longer dependent on the size of the text.

    It consumes the pattern, moving through
    the tree using characters of the former.
    If there are no characters left in P,
    return all the id's of the leaves in
    the subtree rooted at the current node
    reached.

    If you must stop before consuming
    the whole pattern, P does not occur
    in T. Return an empty list then.
    """
    if not tree:
        tree = Tree()
        for c in reversed(T): tree.read(c)

    i, n = 0, tree.root.children.get(P[0])
    if not n: return []

    while i < len(P):
        if i < len(n.pathstring) and P[i] != n.pathstring[i]:
            return []
        elif i >= len(n.pathstring):
            n = n.children.get(P[i])
            if not n: return []
        i += 1

    return sorted(map(lambda x: len(T)+1+x, tree.leaves_id(root=n)))

if __name__ == '__main__':
    text = 'bbabab'
    tree = Tree(text)

    print(string_matching('b', text, tree=tree))
    print(string_matching('bb', text, tree=tree))
    print(string_matching('a', text, tree=tree))
    print(string_matching('ab', text, tree=tree))
    print(string_matching('abab', text, tree=tree))
	from copy import copy
	from itertools import takewhile
	from pdb import set_trace as brk

	def is_prefix(b, a):
	"""Tests if b is a prefix of a."""
	return a[:len(b)] == b

	class Node:
	def __init__(self, pathstring, parent=None, children={},
	shortcut={}, shortcut_for=[], identifier=None):
	self.pathstring = pathstring
	self.parent = parent
	self.children = copy(children)
	self.shortcut = copy(shortcut)
	self.shortcut_for = copy(shortcut_for)
	self.identifier = identifier

	@property
	def delta(self):
	"""Returns the delta between the current pathstring
	and the one of its parent."""
	if not self.parent: return ''
	return self.pathstring[len(self.parent.pathstring):]

	def __repr__(self):
	return f'"{self.pathstring}"'

	class Tree:
	def __init__(self, text):
	self.root = Node(pathstring='')

	n = Node(pathstring='$', parent=self.root, identifier=-1)
	self.root.children['$'] = n
	n.parent = self.root

	self.current_text = '$'
	self.longest_leaf = n

	for c in reversed(text): self.read(c)

	def leaves_id(self, root=None, result=None):
	"""Explores tree from root, returning ids of the leaves."""
	if not root: root = self.root
	if result is None: result = []

	if not root.children:
	result.append(root.identifier)
	return result

	for _, c in root.children.items():
	self.leaves_id(root=c, result=result)

	return result

	def shortcuts(self, root=None, result=None):
	"""Returns all the shortcuts saved in the tree"""
	if not result: result = []
	if not root: root = self.root

	result.extend(map(lambda p, n=root: (root, p[1], p[0]),
	root.shortcut.items()))

	for _, c in root.children.items(): self.shortcuts(root=c, result=result)

	return result

	def read(self, character):
	"""Update tree such to encode suffixes of character+current_text"""
	v = self.longest_leaf
	leaf_pathstring = character+self.current_text

	# Start from longest_leaf up to root, until you find a "character-shortcut"
	while not v is self.root and not v.shortcut.get(character):
	v = v.parent

	# If you reached the root, and there's no character-shortcut, parent is the root
	if v is self.root and not v.shortcut.get(character):
	y = v

	else:
	# Else, follow the shortcut
	s = v.shortcut[character]

	# If s is a prefix of the leaf_pathstring, y=s
	if is_prefix(s.pathstring, leaf_pathstring):
	y = s

	else:
	# Else, build y and put it between the parent of s and s
	parent = s.parent
	del parent.children[s.delta[0]]

	l = len(tuple(takewhile(lambda p: p[0]==p[1],
	zip(s.pathstring, leaf_pathstring))))
	y_pathstring = s.pathstring[:l]
	y = Node(pathstring=y_pathstring)

	y.parent = parent
	parent.children[y.delta[0]] = y

	s.parent = y
	y.children[s.delta[0]] = s

	# y inherits the shortcuts of s
	y.shortcut = copy(s.shortcut)
	for n in y.shortcut.values(): n.shortcut_for.append(y)

	# y may be a better shortcut for nodes that points to s
	for n in s.shortcut_for:
	if is_prefix(character+n.pathstring, y.pathstring):
	n.shortcut[character] = y

	# Set the identifier of the list, using negative python notation
	identifier = self.longest_leaf.identifier-1
	leaf = Node(pathstring=leaf_pathstring, identifier=identifier)

	# Append the leaf to y
	leaf.parent = y
	y.children[leaf.delta[0]] = leaf

	# All the nodes from longest_leaf up to the first character-shortcut
	# must shortcut to the new leaf.
	n = self.longest_leaf
	while not n.shortcut.get(character):
	n.shortcut[character] = leaf
	leaf.shortcut_for.append(n)
	if n is self.root: break
	n = n.parent

	self.current_text = leaf.pathstring
	self.longest_leaf = leaf
	return self

	def string_matching(P, T, tree=None):
	"""Find valid shifts of T to find P.

	It uses a passed suffix tree, or it builds
	a new one using T. It's better to pass
	a previously built tree, such to leverage
	the fact that you can build one suffix
	tree per text and reuse that multiple
	times for different patterns, matching
	the pattern in the text with a complexity
	linear in the size of the current pattern
	- no longer dependent on the size of the text.

	It consumes the pattern, moving through
	the tree using characters of the former.
	If there are no characters left in P,
	return all the id's of the leaves in
	the subtree rooted at the current node
	reached.

	If you must stop before consuming
	the whole pattern, P does not occur
	in T. Return an empty list then.
	"""
	if not tree:
	tree = Tree()
	for c in reversed(T): tree.read(c)

	i, n = 0, tree.root.children.get(P[0])
	if not n: return []

	while i < len(P):
	if i < len(n.pathstring) and P[i] != n.pathstring[i]:
	return []
	elif i >= len(n.pathstring):
	n = n.children.get(P[i])
	if not n: return []
	i += 1

	return sorted(map(lambda x: len(T)+1+x, tree.leaves_id(root=n)))

	if __name__ == '__main__':
	text = 'bbabab'
	tree = Tree(text)

	print(string_matching('b', text, tree=tree))
	print(string_matching('bb', text, tree=tree))
	print(string_matching('a', text, tree=tree))
	print(string_matching('ab', text, tree=tree))
	print(string_matching('abab', text, tree=tree))