Skip to content

Instantly share code, notes, and snippets.

@grondilu
Created September 17, 2013 12:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save grondilu/6593733 to your computer and use it in GitHub Desktop.
Save grondilu/6593733 to your computer and use it in GitHub Desktop.
Ukkonen's online algorithm for suffix trees in Perl 6. Translated from: http://marknelson.us/1996/08/01/suffix-trees/
class Suffix {...}
class Edge {...}
class Node { has Int $.suffix_node is rw = -1 }
subset Explicit of Suffix where { .first_char_index > .last_char_index }
subset Implicit of Suffix where { .last_char_index >= .first_char_index }
my @Nodes = Node.new;
my Blob $T;
class Suffix {
has Int $.origin_node is rw;
has Int $.first_char_index is rw;
has Int $.last_char_index is rw;
multi method new( Int $node, Int $start, Int $stop ) {
self.new:
:origin_node($node),
:first_char_index($start),
:last_char_index($stop)
}
method Canonize() {
if self ~~ Implicit {
my Edge $edge = Edge::Find( $.origin_node, $T[ $.first_char_index ] );
my Int $edge_span = $edge.last_char_index - $edge.first_char_index;
while $edge_span <= $.last_char_index - $.first_char_index {
$.first_char_index += $edge_span + 1;
$.origin_node = $edge.end_node;
if $.first_char_index <= $.last_char_index {
$edge = Edge::Find( $edge.end_node, $T[ $.first_char_index ] );
$edge_span = $edge.last_char_index - $edge.first_char_index;
}
}
}
}
}
class Edge {
our @index;
has Int $.first_char_index is rw;
has Int $.last_char_index is rw;
has Int $.start_node is rw;
has Int $.end_node is rw;
our sub List { gather for @index { take .values } }
method Str { $T[$.first_char_index .. $.last_char_index].list.chrs }
method Insert { @index[$.start_node]{$T[$.first_char_index].chr} = self }
method Remove { @index[$.start_node]{$T[$.first_char_index].chr} = Any }
our sub Find( Int $node, Int $c ) { @index[$node]{$c.chr} }
multi method new(
Int $init_first,
Int $init_last,
Int $parent_node
) {
push @Nodes, Node.new;
self.new:
:first_char_index($init_first),
:last_char_index($init_last),
:start_node($parent_node),
:end_node(@Nodes.end);
}
method SplitEdge( Suffix $s ) returns Int {
self.Remove();
my Edge $new_edge .= new:
$.first_char_index,
$.first_char_index + $s.last_char_index - $s.first_char_index,
$s.origin_node;
$new_edge.Insert;
@Nodes[ $new_edge.end_node ].suffix_node = $s.origin_node;
$.first_char_index += $s.last_char_index - $s.first_char_index + 1;
$.start_node = $new_edge.end_node;
self.Insert;
return $new_edge.end_node;
}
}
sub dump_edges(Int $current_n) {
say " Start End Suf First Last String";
for Edge::List() {
printf "%5d %5d %3d %5d %6d %s\n",
.start_node, .end_node, @Nodes[.end_node].suffix_node,
.first_char_index,
.last_char_index,
.Str
}
}
sub AddPrefix( Suffix $active, Int $last_char_index ) {
my Int $parent_node;
my Int $last_parent_node = -1;
loop {
my $edge;
my $parent_node = $active.origin_node;
if $active ~~ Explicit {
$edge = Edge::Find( $active.origin_node, $T[ $last_char_index ] );
last if $edge ~~ :defined;
} else {
$edge = Edge::Find( $active.origin_node, $T[ $active.first_char_index ] );
my Int $span = $active.last_char_index - $active.first_char_index;
last if
$T[ $edge.first_char_index + $span + 1 ] == $T[ $last_char_index ];
$parent_node = $edge.SplitEdge( $active );
}
my Edge $new_edge .= new: $last_char_index, $T-1, $parent_node;
$new_edge.Insert;
if $last_parent_node > 0 {
@Nodes[ $last_parent_node ].suffix_node = $parent_node;
}
$last_parent_node = $parent_node;
if $active.origin_node == 0 {
$active.first_char_index++
} else {
$active.origin_node = @Nodes[ $active.origin_node ].suffix_node;
}
$active.Canonize;
}
@Nodes[ $last_parent_node ].suffix_node = $parent_node
if $last_parent_node > 0;
$active.last_char_index++;
$active.Canonize;
}
=begin validation
my (@CurrentString, @GoodSuffixes);
my @BranchCount = 0;
sub validate {
@GoodSuffixes = 0 xx $T-1;
walk_tree( 0, 0 );
my Int $error = 0;
for 0..^ $T-1 -> $i {
if @GoodSuffixes[ $i ] != 1 {
note "Suffix $i count wrong!";
$error++;
}
}
note 'All suffixes present!' unless $error;
my Int $leaf_count = 0;
my Int $branch_count = 0;
for ^@Nodes -> $i {
if @BranchCount[ $i ] == 0 {
note "Logic error on node $i, not a leaf or internal node!";
} elsif @BranchCount[ $i ] == -1 {
$leaf_count++;
} else {
$branch_count += @BranchCount[ $i ];
}
}
note "Leaf count : $leaf_count, {
$leaf_count == $T ?? "OK" !! "Error!"
}";
note "branch count: $branch_count, {
$branch_count == @Nodes - 1 ?? "OK" !! "Error!"
}";
}
sub walk_tree( Int $start_node, Int $last_char_so_far ) {
my Int $edges = 0;
for ^256 -> $i {
my $edge = Edge::Find( $start_node, $i );
if $edge !~~ :defined {
if @BranchCount[ $edge.start_node ] < 0 {
note "Logic error on node {$edge.start_node}";
}
@BranchCount[ $edge.start_node ]++;
$edges++;
my Int $l = $last_char_so_far;
for $edge.first_char_index .. $edge.last_char_index {
@CurrentString[ $l++ ] = $T[ $_ ];
}
@CurrentString[ $l ] = '\0';
if walk_tree( $edge.end_node, $l) {
if @BranchCount[ $edge.end_node ] > 0 {
note "Logic error on node {$edge.end_node}";
}
@BranchCount[ $edge.end_node ]--;
}
}
}
if $edges == 0 {
note "Suffix : {@CurrentString[^$last_char_so_far]».chr}";
@GoodSuffixes[ @CurrentString - 1 ]++;
note "comparing: ", join " to ", my ($a, $b) = ($T.subbuf: $T-@CurrentString+1).list.chrs, (@CurrentString ...^ '\0').chrs;
note "Comparison failure!" if $a ne $b;
return 1;
} else { return 0 }
}
=end validation
sub MAIN($string?) {
$T = Blob.new: .ords given $string //
prompt (q:to /END/).chomp;
Normally, suffix trees require that the last
character in the input string be unique. If
you don't do this, your tree will contain
suffixes that don't end in leaf nodes. This is
often a useful requirement. You can build a tree
in this program without meeting this requirement,
but the validation code will flag it as being an
invalid tree
Enter string:
END
my Suffix $active .= new: 0, 0, -1;
AddPrefix( $active, $_ ) for ^$T;
dump_edges( $T-1 );
=for validation
given prompt "Would you like to validate the tree? " {
when m:i/^ y/ { validate }
default {}
}
}
# vim: ft=perl6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment