Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm, written in Python 2.7
#!/usr/bin/env python
# -*- coding: utf-8
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm
Author: Ilya Stepanov <code at>
(c) 2013
from __future__ import print_function
import sys
import re
import argparse
END_OF_STRING = sys.maxint
class SuffixTreeNode:
Suffix tree node class. Actually, it also respresents a tree edge that points to this node.
new_identifier = 0
def __init__(self, start=0, end=END_OF_STRING):
self.identifier = SuffixTreeNode.new_identifier
SuffixTreeNode.new_identifier += 1
# suffix link is required by Ukkonen's algorithm
self.suffix_link = None
# child edges/nodes, each dict key represents the first letter of an edge
self.edges = {}
# stores reference to parent
self.parent = None
# bit vector shows to which strings this node belongs
self.bit_vector = 0
# edge info: start index and end index
self.start = start
self.end = end
def add_child(self, key, start, end):
Create a new child node
key: a char that will be used during active edge searching
start, end: node's edge start and end indices
created child node
child = SuffixTreeNode(start=start, end=end)
child.parent = self
self.edges[key] = child
return child
def add_exisiting_node_as_child(self, key, node):
Add an existing node as a child
key: a char that will be used during active edge searching
node: a node that will be added as a child
node.parent = self
self.edges[key] = node
def get_edge_length(self, current_index):
Get length of an edge that points to this node
current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index)
return min(self.end, current_index + 1) - self.start
def __str__(self):
return 'id=' + str(self.identifier)
class SuffixTree:
Generalized suffix tree
def __init__(self):
# the root node
self.root = SuffixTreeNode()
# all strings are concatenaited together. Tree's nodes stores only indices
self.input_string = ''
# number of strings stored by this tree
self.strings_count = 0
# list of tree leaves
self.leaves = []
def append_string(self, input_string):
Add new string to the suffix tree
start_index = len(self.input_string)
current_string_index = self.strings_count
# each sting should have a unique ending
input_string += '$' + str(current_string_index)
# gathering 'em all together
self.input_string += input_string
self.strings_count += 1
# these 3 variables represents current "active point"
active_node = self.root
active_edge = 0
active_length = 0
# shows how many
remainder = 0
# new leaves appended to tree
new_leaves = []
# main circle
for index in range(start_index, len(self.input_string)):
previous_node = None
remainder += 1
while remainder > 0:
if active_length == 0:
active_edge = index
if self.input_string[active_edge] not in active_node.edges:
# no edge starting with current char, so creating a new leaf node
leaf_node = active_node.add_child(self.input_string[active_edge], index, END_OF_STRING)
# a leaf node will always be leaf node belonging to only one string
# (because each string has different termination)
leaf_node.bit_vector = 1 << current_string_index
# doing suffix link magic
if previous_node is not None:
previous_node.suffix_link = active_node
previous_node = active_node
# ok, we've got an active edge
next_node = active_node.edges[self.input_string[active_edge]]
# walking down through edges (if active_length is bigger than edge length)
next_edge_length = next_node.get_edge_length(index)
if active_length >= next_node.get_edge_length(index):
active_edge += next_edge_length
active_length -= next_edge_length
active_node = next_node
# current edge already contains the suffix we need to insert.
# Increase the active_length and go forward
if self.input_string[next_node.start + active_length] == self.input_string[index]:
active_length += 1
if previous_node is not None:
previous_node.suffix_link = active_node
previous_node = active_node
# splitting edge
split_node = active_node.add_child(
next_node.start + active_length
next_node.start += active_length
split_node.add_exisiting_node_as_child(self.input_string[next_node.start], next_node)
leaf_node = split_node.add_child(self.input_string[index], index, END_OF_STRING)
leaf_node.bit_vector = 1 << current_string_index
# suffix link magic again
if previous_node is not None:
previous_node.suffix_link = split_node
previous_node = split_node
remainder -= 1
# follow suffix link (if exists) or go to root
if active_node == self.root and active_length > 0:
active_length -= 1
active_edge = index - remainder + 1
active_node = active_node.suffix_link if active_node.suffix_link is not None else self.root
# update leaves ends from "infinity" to actual string end
for leaf in new_leaves:
leaf.end = len(self.input_string)
def find_longest_common_substrings(self):
Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings
# all bits are set
success_bit_vector = 2 ** self.strings_count - 1
lowest_common_ancestors = []
# going up to the root
for leaf in self.leaves:
node = leaf
while node.parent is not None:
if node.bit_vector != success_bit_vector:
# updating parent's bit vector
node.parent.bit_vector |= node.bit_vector
node = node.parent
# hey, we've found a lowest common ancestor!
longest_common_substrings = ['']
longest_length = 0
# need to filter the result array and get the longest common strings
for common_ancestor in lowest_common_ancestors:
common_substring = ''
node = common_ancestor
while node.parent is not None:
label = self.input_string[node.start:node.end]
common_substring = label + common_substring
node = node.parent
# remove unique endings ($<number>), we don't need them anymore
common_substring = re.sub(r'(.*?)\$?\d*$', r'\1', common_substring)
if len(common_substring) > longest_length:
longest_length = len(common_substring)
longest_common_substrings = [common_substring]
elif len(common_substring) == longest_length and common_substring not in longest_common_substrings:
return longest_common_substrings
def to_graphviz(self, node=None, output=''):
Show the tree as graphviz string. For debugging purposes only
if node is None:
node = self.root
output = 'digraph G {edge [arrowsize=0.4,fontsize=10];'
output +=\
str(node.identifier) + '[label="' +\
str(node.identifier) + '\\n' + '{0:b}'.format(node.bit_vector).zfill(self.strings_count) + '"'
if node.bit_vector == 2 ** self.strings_count - 1:
output += ',style="filled",fillcolor="red"'
output += '];'
if node.suffix_link is not None:
output += str(node.identifier) + '->' + str(node.suffix_link.identifier) + '[style="dashed"];'
for child in node.edges.values():
label = self.input_string[child.start:child.end]
output += str(node.identifier) + '->' + str(child.identifier) + '[label="' + label + '"];'
output = self.to_graphviz(child, output)
if node == self.root:
output += '}'
return output
def __str__(self):
return self.to_graphviz()
def main():
parser = argparse.ArgumentParser(
description='Searching longest common substring. '
'Uses Ukkonen\'s suffix tree algorithm and generalized suffix tree. '
'Written by Ilya Stepanov (c) 2013'
help='String for searching',
help='Path for input file. First line should contain number of lines to search in'
help='Debug mode: shows generalized suffix tree in output (graphviz)',
args = parser.parse_args()
if not args.strings and not args.file:
suffix_tree = SuffixTree()
for s in args.strings:
if args.file:
with open(args.file, 'rU') as f:
first_line = f.readline().strip('\r\n')
if first_line.isdigit():
max_lines_count = int(first_line)
max_lines_count = sys.maxint
for index, line in enumerate(f):
if index >= max_lines_count:
lcs = suffix_tree.find_longest_common_substrings()
for s in lcs:
if args.debug:
if __name__ == "__main__":
Copy link

mohsinkhalid122 commented May 14, 2017

Hi istepanov, can we print all the substrings using your code
string1 = 'TeleTableToTree'
stirng2 = 'TableIsTree'

the code should print: Table Tree
not just 'Table'

Copy link

pombredanne commented May 8, 2019

@istepanov what would be the license for this fine code?

Copy link

christygo commented Aug 7, 2020

Hi; I am new to coding and was looking for some help. Where in the code do I type in the two strings that I want to compare?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment