Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Longest common substrings using generalized suffix trees built with Ukkonen's algorithm.
#!/usr/bin/env python
# -*- coding: utf-8
"""
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm
Author: Ilya Stepanov <code at ilyastepanov.com>
(c) 2013
"""
from __future__ import print_function
import sys
import re
import argparse
END_OF_STRING = sys.maxint
class SuffixTreeNode:
"""
Suffix tree node class. Actually, it also respresents a tree edge that points to this node.
"""
new_identifier = 0
def __init__(self, start=0, end=END_OF_STRING):
self.identifier = SuffixTreeNode.new_identifier
SuffixTreeNode.new_identifier += 1
# suffix link is required by Ukkonen's algorithm
self.suffix_link = None
# child edges/nodes, each dict key represents the first letter of an edge
self.edges = {}
# stores reference to parent
self.parent = None
# bit vector shows to which strings this node belongs
self.bit_vector = 0
# edge info: start index and end index
self.start = start
self.end = end
def add_child(self, key, start, end):
"""
Create a new child node
Agrs:
key: a char that will be used during active edge searching
start, end: node's edge start and end indices
Returns:
created child node
"""
child = SuffixTreeNode(start=start, end=end)
child.parent = self
self.edges[key] = child
return child
def add_exisiting_node_as_child(self, key, node):
"""
Add an existing node as a child
Args:
key: a char that will be used during active edge searching
node: a node that will be added as a child
"""
node.parent = self
self.edges[key] = node
def get_edge_length(self, current_index):
"""
Get length of an edge that points to this node
Args:
current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index)
"""
return min(self.end, current_index + 1) - self.start
def __str__(self):
return 'id=' + str(self.identifier)
class SuffixTree:
"""
Generalized suffix tree
"""
def __init__(self):
# the root node
self.root = SuffixTreeNode()
# all strings are concatenaited together. Tree's nodes stores only indices
self.input_string = ''
# number of strings stored by this tree
self.strings_count = 0
# list of tree leaves
self.leaves = []
def append_string(self, input_string):
"""
Add new string to the suffix tree
"""
start_index = len(self.input_string)
current_string_index = self.strings_count
# each sting should have a unique ending
input_string += '$' + str(current_string_index)
# gathering 'em all together
self.input_string += input_string
self.strings_count += 1
# these 3 variables represents current "active point"
active_node = self.root
active_edge = 0
active_length = 0
# shows how many
remainder = 0
# new leaves appended to tree
new_leaves = []
# main circle
for index in range(start_index, len(self.input_string)):
previous_node = None
remainder += 1
while remainder > 0:
if active_length == 0:
active_edge = index
if self.input_string[active_edge] not in active_node.edges:
# no edge starting with current char, so creating a new leaf node
leaf_node = active_node.add_child(self.input_string[active_edge], index, END_OF_STRING)
# a leaf node will always be leaf node belonging to only one string
# (because each string has different termination)
leaf_node.bit_vector = 1 << current_string_index
new_leaves.append(leaf_node)
# doing suffix link magic
if previous_node is not None:
previous_node.suffix_link = active_node
previous_node = active_node
else:
# ok, we've got an active edge
next_node = active_node.edges[self.input_string[active_edge]]
# walking down through edges (if active_length is bigger than edge length)
next_edge_length = next_node.get_edge_length(index)
if active_length >= next_node.get_edge_length(index):
active_edge += next_edge_length
active_length -= next_edge_length
active_node = next_node
continue
# current edge already contains the suffix we need to insert.
# Increase the active_length and go forward
if self.input_string[next_node.start + active_length] == self.input_string[index]:
active_length += 1
if previous_node is not None:
previous_node.suffix_link = active_node
previous_node = active_node
break
# splitting edge
split_node = active_node.add_child(
self.input_string[active_edge],
next_node.start,
next_node.start + active_length
)
next_node.start += active_length
split_node.add_exisiting_node_as_child(self.input_string[next_node.start], next_node)
leaf_node = split_node.add_child(self.input_string[index], index, END_OF_STRING)
leaf_node.bit_vector = 1 << current_string_index
new_leaves.append(leaf_node)
# suffix link magic again
if previous_node is not None:
previous_node.suffix_link = split_node
previous_node = split_node
remainder -= 1
# follow suffix link (if exists) or go to root
if active_node == self.root and active_length > 0:
active_length -= 1
active_edge = index - remainder + 1
else:
active_node = active_node.suffix_link if active_node.suffix_link is not None else self.root
# update leaves ends from "infinity" to actual string end
for leaf in new_leaves:
leaf.end = len(self.input_string)
self.leaves.extend(new_leaves)
def find_longest_common_substrings(self):
"""
Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings
"""
# all bits are set
success_bit_vector = 2 ** self.strings_count - 1
lowest_common_ancestors = []
# going up to the root
for leaf in self.leaves:
node = leaf
while node.parent is not None:
if node.bit_vector != success_bit_vector:
# updating parent's bit vector
node.parent.bit_vector |= node.bit_vector
node = node.parent
else:
# hey, we've found a lowest common ancestor!
lowest_common_ancestors.append(node)
break
longest_common_substrings = ['']
longest_length = 0
# need to filter the result array and get the longest common strings
for common_ancestor in lowest_common_ancestors:
common_substring = ''
node = common_ancestor
while node.parent is not None:
label = self.input_string[node.start:node.end]
common_substring = label + common_substring
node = node.parent
# remove unique endings ($<number>), we don't need them anymore
common_substring = re.sub(r'(.*?)\$?\d*$', r'\1', common_substring)
if len(common_substring) > longest_length:
longest_length = len(common_substring)
longest_common_substrings = [common_substring]
elif len(common_substring) == longest_length and common_substring not in longest_common_substrings:
longest_common_substrings.append(common_substring)
return longest_common_substrings
def to_graphviz(self, node=None, output=''):
"""
Show the tree as graphviz string. For debugging purposes only
"""
if node is None:
node = self.root
output = 'digraph G {edge [arrowsize=0.4,fontsize=10];'
output +=\
str(node.identifier) + '[label="' +\
str(node.identifier) + '\\n' + '{0:b}'.format(node.bit_vector).zfill(self.strings_count) + '"'
if node.bit_vector == 2 ** self.strings_count - 1:
output += ',style="filled",fillcolor="red"'
output += '];'
if node.suffix_link is not None:
output += str(node.identifier) + '->' + str(node.suffix_link.identifier) + '[style="dashed"];'
for child in node.edges.values():
label = self.input_string[child.start:child.end]
output += str(node.identifier) + '->' + str(child.identifier) + '[label="' + label + '"];'
output = self.to_graphviz(child, output)
if node == self.root:
output += '}'
return output
def __str__(self):
return self.to_graphviz()
def main():
parser = argparse.ArgumentParser(
description='Searching longest common substring. '
'Uses Ukkonen\'s suffix tree algorithm and generalized suffix tree. '
'Written by Ilya Stepanov (c) 2013'
)
parser.add_argument(
'strings',
metavar='STRING',
nargs='*',
help='String for searching',
)
parser.add_argument(
'-f',
'--file',
help='Path for input file. First line should contain number of lines to search in'
)
parser.add_argument(
'-d',
'--debug',
help='Debug mode: shows generalized suffix tree in output (graphviz)',
action="store_true"
)
args = parser.parse_args()
if not args.strings and not args.file:
parser.print_help()
exit()
suffix_tree = SuffixTree()
for s in args.strings:
suffix_tree.append_string(s)
if args.file:
with open(args.file, 'rU') as f:
first_line = f.readline().strip('\r\n')
if first_line.isdigit():
max_lines_count = int(first_line)
else:
max_lines_count = sys.maxint
suffix_tree.append_string(first_line)
for index, line in enumerate(f):
if index >= max_lines_count:
break
suffix_tree.append_string(line.strip('\r\n'))
lcs = suffix_tree.find_longest_common_substrings()
for s in lcs:
print(s)
if args.debug:
print(suffix_tree.to_graphviz())
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment