Skip to content

Instantly share code, notes, and snippets.

@masak
Created November 20, 2010 04:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masak/707628 to your computer and use it in GitHub Desktop.
Save masak/707628 to your computer and use it in GitHub Desktop.
Doing .trans using a longest-token matcher
use v6;
use MONKEY_TYPING;
class LongestSubstitutionMatcher {
has Cool $!source is readonly;
has @!substitutions;
has Int $!index = 0;
has Int $!next_match;
has $!next_substitution;
has $!substitution_length;
has Str $.unsubstituted_text;
has Str $.substituted_text;
method add_substitution($key, $value) {
push @!substitutions, $key => $value;
}
submethod compare_substitution($substitution, Int $pos, Int $length) {
if $!next_match > $pos
|| $!next_match == $pos && $!substitution_length < $length {
$!next_match = $pos;
$!substitution_length = $length;
$!next_substitution = $substitution;
}
}
multi submethod triage_substitution($_ where { .key ~~ Regex }) {
my $key = .key;
return unless $!source.substr($!index) ~~ $key;
self.compare_substitution($_, $!index + $/.from, $/.to - $/.from);
}
multi submethod triage_substitution($_ where { .key ~~ Cool }) {
return unless defined index($!source, .key, $!index);
self.compare_substitution($_,
index($!source, .key, $!index),
.key.chars);
}
multi submethod triage_substitution($_) {
die "Don't know how to handle a {.WHAT} as a substitution key";
}
multi submethod increment_index(Regex $s) {
$!source.substr($!index) ~~ $s;
$!index = $!next_match + $/.chars;
}
multi submethod increment_index(Str $s) {
$!index = $!next_match + $s.chars;
}
method next_substitution() {
$!next_match = $!source.chars;
for @!substitutions {
self.triage_substitution($_);
}
$!unsubstituted_text
= $!source.substr($!index, $!next_match - $!index);
if defined $!next_substitution {
my $result = $!next_substitution.value;
$!substituted_text = $result ~~ Callable ?? $result() !! $result;
self.increment_index($!next_substitution.key);
}
return $!next_match < $!source.chars;
}
}
augment class Str {
multi method xtrans(*@changes) {
my sub expand($s) {
return $s.list if $s ~~ Iterable|Positional;
gather for $s.comb(/ (\w) '..' (\w) | . /, :match) {
if .[0] {
take $_ for ~.[0] .. ~.[1];
} else {
take ~$_;
}
}
}
my $ltm = LongestSubstitutionMatcher.new(:source(self));
for (@changes) -> $p {
die "$p.perl() is not a Pair" unless $p ~~ Pair;
if $p.key ~~ Regex {
$ltm.add_substitution($p.key, $p.value);
}
elsif $p.value ~~ Callable {
my @from = expand $p.key;
for @from -> $f {
$ltm.add_substitution($f, $p.value);
}
}
else {
my @from = expand $p.key;
my @to = expand $p.value;
if @to {
@to = @to xx ceiling(@from / @to);
} else {
@to = '' xx @from;
}
for @from Z @to -> $f, $t {
# XXX Should add contingency for key already existing
# XXX Is there a test for that?
$ltm.add_substitution($f, $t);
}
}
}
my $r = "";
while $ltm.next_substitution {
$r ~= $ltm.unsubstituted_text ~ $ltm.substituted_text;
}
$r ~= $ltm.unsubstituted_text;
return $r;
}
}
use Test;
plan 37;
is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ),
"abc",
"Each side can be individual characters");
is("XYZ".xtrans( ('XYZ' => 'xyz') ),
"xyz",
"The two sides of the any pair can be strings interpreted as tr/// would multichar");
is("ABC".xtrans( ('A..C' => 'a..c') ),
"abc",
"The two sides of the any pair can be strings interpreted as tr/// would range");
is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")),
"abc_def",
"If the first character is a dash it isn't part of a range");
is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")),
"abc_def",
"If the last character is a dash it isn't part of a range");
is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ),
"abcdeF",
"The two sides can consists of both chars and ranges");
is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ),
"abcDefgH",
"The two sides can consist of multiple ranges");
is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ),
"abcxyz",
"The two sides of each pair may also be array references" );
is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE",
"Using string range on one side and array reference on the other");
is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde",
"Using array reference on one side and string range on the other");
is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&lt;', '&gt;', '&amp;'] =>
[' ', '<', '>', '&' ])),
" <>&","The array version can map one characters to one-or-more characters");
is(" <>&".xtrans( ([' ', '<', '>', '&' ] =>
['&nbsp;', '&lt;', '&gt;', '&amp;' ])),
"&nbsp;&lt;&gt;&amp;",
"The array version can map one-or-more characters to one-or-more characters");
is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&nbsp;&lt;', '&lt;', '&gt;', '&amp;'] =>
[' ', 'AB', '<', '>', '&' ])),
"AB>&",
"The array version can map one characters to one-or-more characters, using leftmost longest match");
is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']),
"Just another Perl hacker",
"Ranges can be grouped");
is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'),
"Just another Perl hacker",
"Multiple ranges interpreted in string");
# Per S05 changes
{
is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'),
"Just_another_Perl_hacker",
"Spaces in interpreted ranges are not skipped (all spaces are important)");
is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'),
"Whfg nnbgure Crey unpxre",
"Spaces in interpreted ranges are not skipped (all spaces are important)");
};
my $a = "abcdefghijklmnopqrstuvwxyz";
my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
is($a.xtrans('a..z' => 'A..Z'), $b);
is($b.xtrans('A..Z' => 'a..z'), $a);
is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz');
is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj");
is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190");
is($b.xtrans('..H..Z' => '__h..z'),
'ABCDEFGhijklmnopqrstuvwxyz',
'leading ranges interpreted as string');
is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
'trailing ranges interpreted as string');
is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
'leading, trailing ranges interpreted as string');
# added as a consequence of RT #76720
is("hello".xtrans("l" => ""), "heo", "can replace with empty string");
# complement, squeeze/squash, delete
is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx',
'no flags');
is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty',
'no flags');
{
# remove vowel and character after
is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works');
is( # vowels become 'y' and whitespace becomes '_'
"ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'),
'yb_cd_yf_gh',
'regexes pairs work',
);
my $i = 0;
is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure');
$i = 0;
my $j = 0;
is(
'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}),
'a0b0c1d1e2f',
'closure pairs work',
);
};
# RT #71088
{
lives_ok { "".subst(/x/, "").xtrans() },
'trans on subst output lives';
}
is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']),
'1243',
'longest constant token preferred, regardless of declaration order');
is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct',
'longest regex token preferred, regardless of declaration order');
is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111',
'in case of a tie between regex lengths, prefer the first one');
is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] =>
['1', '2', '3', '4' ]),
'23',
'longest token still holds, even between constant strings and regexes');
# vim: ft=perl6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment