Created
September 17, 2013 12:33
-
-
Save grondilu/6593733 to your computer and use it in GitHub Desktop.
Ukkonen's online algorithm for suffix trees in Perl 6. Translated from: http://marknelson.us/1996/08/01/suffix-trees/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Suffix {...} | |
class Edge {...} | |
class Node { has Int $.suffix_node is rw = -1 } | |
subset Explicit of Suffix where { .first_char_index > .last_char_index } | |
subset Implicit of Suffix where { .last_char_index >= .first_char_index } | |
my @Nodes = Node.new; | |
my Blob $T; | |
class Suffix { | |
has Int $.origin_node is rw; | |
has Int $.first_char_index is rw; | |
has Int $.last_char_index is rw; | |
multi method new( Int $node, Int $start, Int $stop ) { | |
self.new: | |
:origin_node($node), | |
:first_char_index($start), | |
:last_char_index($stop) | |
} | |
method Canonize() { | |
if self ~~ Implicit { | |
my Edge $edge = Edge::Find( $.origin_node, $T[ $.first_char_index ] ); | |
my Int $edge_span = $edge.last_char_index - $edge.first_char_index; | |
while $edge_span <= $.last_char_index - $.first_char_index { | |
$.first_char_index += $edge_span + 1; | |
$.origin_node = $edge.end_node; | |
if $.first_char_index <= $.last_char_index { | |
$edge = Edge::Find( $edge.end_node, $T[ $.first_char_index ] ); | |
$edge_span = $edge.last_char_index - $edge.first_char_index; | |
} | |
} | |
} | |
} | |
} | |
class Edge { | |
our @index; | |
has Int $.first_char_index is rw; | |
has Int $.last_char_index is rw; | |
has Int $.start_node is rw; | |
has Int $.end_node is rw; | |
our sub List { gather for @index { take .values } } | |
method Str { $T[$.first_char_index .. $.last_char_index].list.chrs } | |
method Insert { @index[$.start_node]{$T[$.first_char_index].chr} = self } | |
method Remove { @index[$.start_node]{$T[$.first_char_index].chr} = Any } | |
our sub Find( Int $node, Int $c ) { @index[$node]{$c.chr} } | |
multi method new( | |
Int $init_first, | |
Int $init_last, | |
Int $parent_node | |
) { | |
push @Nodes, Node.new; | |
self.new: | |
:first_char_index($init_first), | |
:last_char_index($init_last), | |
:start_node($parent_node), | |
:end_node(@Nodes.end); | |
} | |
method SplitEdge( Suffix $s ) returns Int { | |
self.Remove(); | |
my Edge $new_edge .= new: | |
$.first_char_index, | |
$.first_char_index + $s.last_char_index - $s.first_char_index, | |
$s.origin_node; | |
$new_edge.Insert; | |
@Nodes[ $new_edge.end_node ].suffix_node = $s.origin_node; | |
$.first_char_index += $s.last_char_index - $s.first_char_index + 1; | |
$.start_node = $new_edge.end_node; | |
self.Insert; | |
return $new_edge.end_node; | |
} | |
} | |
sub dump_edges(Int $current_n) { | |
say " Start End Suf First Last String"; | |
for Edge::List() { | |
printf "%5d %5d %3d %5d %6d %s\n", | |
.start_node, .end_node, @Nodes[.end_node].suffix_node, | |
.first_char_index, | |
.last_char_index, | |
.Str | |
} | |
} | |
sub AddPrefix( Suffix $active, Int $last_char_index ) { | |
my Int $parent_node; | |
my Int $last_parent_node = -1; | |
loop { | |
my $edge; | |
my $parent_node = $active.origin_node; | |
if $active ~~ Explicit { | |
$edge = Edge::Find( $active.origin_node, $T[ $last_char_index ] ); | |
last if $edge ~~ :defined; | |
} else { | |
$edge = Edge::Find( $active.origin_node, $T[ $active.first_char_index ] ); | |
my Int $span = $active.last_char_index - $active.first_char_index; | |
last if | |
$T[ $edge.first_char_index + $span + 1 ] == $T[ $last_char_index ]; | |
$parent_node = $edge.SplitEdge( $active ); | |
} | |
my Edge $new_edge .= new: $last_char_index, $T-1, $parent_node; | |
$new_edge.Insert; | |
if $last_parent_node > 0 { | |
@Nodes[ $last_parent_node ].suffix_node = $parent_node; | |
} | |
$last_parent_node = $parent_node; | |
if $active.origin_node == 0 { | |
$active.first_char_index++ | |
} else { | |
$active.origin_node = @Nodes[ $active.origin_node ].suffix_node; | |
} | |
$active.Canonize; | |
} | |
@Nodes[ $last_parent_node ].suffix_node = $parent_node | |
if $last_parent_node > 0; | |
$active.last_char_index++; | |
$active.Canonize; | |
} | |
=begin validation | |
my (@CurrentString, @GoodSuffixes); | |
my @BranchCount = 0; | |
sub validate { | |
@GoodSuffixes = 0 xx $T-1; | |
walk_tree( 0, 0 ); | |
my Int $error = 0; | |
for 0..^ $T-1 -> $i { | |
if @GoodSuffixes[ $i ] != 1 { | |
note "Suffix $i count wrong!"; | |
$error++; | |
} | |
} | |
note 'All suffixes present!' unless $error; | |
my Int $leaf_count = 0; | |
my Int $branch_count = 0; | |
for ^@Nodes -> $i { | |
if @BranchCount[ $i ] == 0 { | |
note "Logic error on node $i, not a leaf or internal node!"; | |
} elsif @BranchCount[ $i ] == -1 { | |
$leaf_count++; | |
} else { | |
$branch_count += @BranchCount[ $i ]; | |
} | |
} | |
note "Leaf count : $leaf_count, { | |
$leaf_count == $T ?? "OK" !! "Error!" | |
}"; | |
note "branch count: $branch_count, { | |
$branch_count == @Nodes - 1 ?? "OK" !! "Error!" | |
}"; | |
} | |
sub walk_tree( Int $start_node, Int $last_char_so_far ) { | |
my Int $edges = 0; | |
for ^256 -> $i { | |
my $edge = Edge::Find( $start_node, $i ); | |
if $edge !~~ :defined { | |
if @BranchCount[ $edge.start_node ] < 0 { | |
note "Logic error on node {$edge.start_node}"; | |
} | |
@BranchCount[ $edge.start_node ]++; | |
$edges++; | |
my Int $l = $last_char_so_far; | |
for $edge.first_char_index .. $edge.last_char_index { | |
@CurrentString[ $l++ ] = $T[ $_ ]; | |
} | |
@CurrentString[ $l ] = '\0'; | |
if walk_tree( $edge.end_node, $l) { | |
if @BranchCount[ $edge.end_node ] > 0 { | |
note "Logic error on node {$edge.end_node}"; | |
} | |
@BranchCount[ $edge.end_node ]--; | |
} | |
} | |
} | |
if $edges == 0 { | |
note "Suffix : {@CurrentString[^$last_char_so_far]».chr}"; | |
@GoodSuffixes[ @CurrentString - 1 ]++; | |
note "comparing: ", join " to ", my ($a, $b) = ($T.subbuf: $T-@CurrentString+1).list.chrs, (@CurrentString ...^ '\0').chrs; | |
note "Comparison failure!" if $a ne $b; | |
return 1; | |
} else { return 0 } | |
} | |
=end validation | |
sub MAIN($string?) { | |
$T = Blob.new: .ords given $string // | |
prompt (q:to /END/).chomp; | |
Normally, suffix trees require that the last | |
character in the input string be unique. If | |
you don't do this, your tree will contain | |
suffixes that don't end in leaf nodes. This is | |
often a useful requirement. You can build a tree | |
in this program without meeting this requirement, | |
but the validation code will flag it as being an | |
invalid tree | |
Enter string: | |
END | |
my Suffix $active .= new: 0, 0, -1; | |
AddPrefix( $active, $_ ) for ^$T; | |
dump_edges( $T-1 ); | |
=for validation | |
given prompt "Would you like to validate the tree? " { | |
when m:i/^ y/ { validate } | |
default {} | |
} | |
} | |
# vim: ft=perl6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment