masak/new-trans-implementation.p6

## new-trans-implementation.p6
use v6;
use MONKEY_TYPING;

class LongestSubstitutionMatcher {
    has Cool $!source is readonly;
    has      @!substitutions;

    has Int  $!index = 0;
    has Int  $!next_match;
    has      $!next_substitution;
    has      $!substitution_length;

    has Str  $.unsubstituted_text;
    has Str  $.substituted_text;

    method add_substitution($key, $value) {
        push @!substitutions, $key => $value;
    }

    submethod compare_substitution($substitution, Int $pos, Int $length) {
        if $!next_match > $pos
           || $!next_match == $pos && $!substitution_length < $length {

            $!next_match = $pos;
            $!substitution_length = $length;
            $!next_substitution = $substitution;
        }
    }

    multi submethod triage_substitution($_ where { .key ~~ Regex }) {
        my $key = .key;
        return unless $!source.substr($!index) ~~ $key;
        self.compare_substitution($_, $!index + $/.from, $/.to - $/.from);
    }

    multi submethod triage_substitution($_ where { .key ~~ Cool }) {
        return unless defined index($!source, .key, $!index);
        self.compare_substitution($_,
                                  index($!source, .key, $!index),
                                  .key.chars);
    }

    multi submethod triage_substitution($_) {
        die "Don't know how to handle a {.WHAT} as a substitution key";
    }

    multi submethod increment_index(Regex $s) {
        $!source.substr($!index) ~~ $s;
        $!index = $!next_match + $/.chars;
    }

    multi submethod increment_index(Str $s) {
        $!index = $!next_match + $s.chars;
    }

    method next_substitution() {
        $!next_match = $!source.chars;

        for @!substitutions {
            self.triage_substitution($_);
        }

        $!unsubstituted_text
            = $!source.substr($!index, $!next_match - $!index);
        if defined $!next_substitution {
            my $result = $!next_substitution.value;
            $!substituted_text = $result ~~ Callable ?? $result() !! $result;
            self.increment_index($!next_substitution.key);
        }

        return $!next_match < $!source.chars;
    }
}

augment class Str {

    multi method xtrans(*@changes) {
        my sub expand($s) {
            return $s.list if $s ~~ Iterable|Positional;
            gather for $s.comb(/ (\w) '..' (\w) | . /, :match) {
                if .[0] {
                    take $_ for ~.[0] .. ~.[1];
                } else {
                    take ~$_;
                }
            }
        }

        my $ltm = LongestSubstitutionMatcher.new(:source(self));
        for (@changes) -> $p {
            die "$p.perl() is not a Pair" unless $p ~~ Pair;
            if $p.key ~~ Regex {
                $ltm.add_substitution($p.key, $p.value);
            }
            elsif $p.value ~~ Callable {
                my @from = expand $p.key;
                for @from -> $f {
                    $ltm.add_substitution($f, $p.value);
                }
            }
            else {
                my @from = expand $p.key;
                my @to   = expand $p.value;
                if @to {
                    @to = @to xx ceiling(@from / @to);
                } else {
                    @to = '' xx @from;
                }
                for @from Z @to -> $f, $t {
                    # XXX Should add contingency for key already existing
                    # XXX Is there a test for that?
                    $ltm.add_substitution($f, $t);
                }
            }
        }

        my $r = "";
        while $ltm.next_substitution {
            $r ~= $ltm.unsubstituted_text ~ $ltm.substituted_text;
        }
        $r ~= $ltm.unsubstituted_text;

        return $r;
    }
}

use Test;

plan 37;

is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ),
    "abc",
    "Each side can be individual characters");

is("XYZ".xtrans( ('XYZ' => 'xyz') ),
    "xyz",
    "The two sides of the any pair can be strings interpreted as tr/// would multichar");

is("ABC".xtrans( ('A..C' => 'a..c') ),
    "abc",
    "The two sides of the any pair can be strings interpreted as tr/// would range");

is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")),
    "abc_def",
    "If the first character is a dash it isn't part of a range");

is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")),
    "abc_def",
    "If the last character is a dash it isn't part of a range");

is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ),
    "abcdeF",
    "The two sides can consists of both chars and ranges");

is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ),
    "abcDefgH",
    "The two sides can consist of multiple ranges");

is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ),
    "abcxyz",
    "The two sides of each pair may also be array references" );

is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE",
	   "Using string range on one side and array reference on the other");

is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde",
	   "Using array reference on one side and string range on the other");

is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&lt;', '&gt;', '&amp;'] =>
    [' ',      '<',    '>',    '&'     ])),
    " <>&","The array version can map one characters to one-or-more characters");

is(" <>&".xtrans( ([' ',      '<',    '>',    '&'    ] =>
                  ['&nbsp;', '&lt;', '&gt;', '&amp;' ])),
                  "&nbsp;&lt;&gt;&amp;",
    "The array version can map one-or-more characters to one-or-more characters");

is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&nbsp;&lt;', '&lt;', '&gt;', '&amp;'] =>
                                 [' ',      'AB',         '<',    '>',    '&'    ])),
                                "AB>&",
    "The array version can map one characters to one-or-more characters, using leftmost longest match");

is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']),
    "Just another Perl hacker",
    "Ranges can be grouped");

is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'),
    "Just another Perl hacker",
    "Multiple ranges interpreted in string");

# Per S05 changes
{
is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'),
    "Just_another_Perl_hacker",
    "Spaces in interpreted ranges are not skipped (all spaces are important)");
is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'),
    "Whfg nnbgure Crey unpxre",
    "Spaces in interpreted ranges are not skipped (all spaces are important)");
};

my $a = "abcdefghijklmnopqrstuvwxyz";

my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

is($a.xtrans('a..z' => 'A..Z'), $b);

is($b.xtrans('A..Z' => 'a..z'), $a);

is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz');

is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj");

is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190");

is($b.xtrans('..H..Z' => '__h..z'),
    'ABCDEFGhijklmnopqrstuvwxyz',
    'leading ranges interpreted as string');

is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
    'trailing ranges interpreted as string');

is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
    'leading, trailing ranges interpreted as string');

# added as a consequence of RT #76720
is("hello".xtrans("l" => ""), "heo", "can replace with empty string");

# complement, squeeze/squash, delete

is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx',
    'no flags');

is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty',
    'no flags');

{
# remove vowel and character after
    is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works');
    is( # vowels become 'y' and whitespace becomes '_'
        "ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'),
        'yb_cd_yf_gh',
        'regexes pairs work',
    );

    my $i = 0;
    is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure');

    $i = 0;
    my $j = 0;
    is(
        'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}),
        'a0b0c1d1e2f',
        'closure pairs work',
    );
};

# RT #71088
{
    lives_ok { "".subst(/x/, "").xtrans() },
        'trans on subst output lives';
}

is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']),
   '1243',
   'longest constant token preferred, regardless of declaration order');

is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct',
   'longest regex token preferred, regardless of declaration order');

is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111',
   'in case of a tie between regex lengths, prefer the first one');

is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] =>
                   ['1',  '2',   '3',   '4'   ]),
   '23',
   'longest token still holds, even between constant strings and regexes');

# vim: ft=perl6
	use v6;
	use MONKEY_TYPING;

	class LongestSubstitutionMatcher {
	has Cool $!source is readonly;
	has @!substitutions;

	has Int $!index = 0;
	has Int $!next_match;
	has $!next_substitution;
	has $!substitution_length;

	has Str $.unsubstituted_text;
	has Str $.substituted_text;

	method add_substitution($key, $value) {
	push @!substitutions, $key => $value;
	}

	submethod compare_substitution($substitution, Int $pos, Int $length) {
	if $!next_match > $pos
	\|\| $!next_match == $pos && $!substitution_length < $length {

	$!next_match = $pos;
	$!substitution_length = $length;
	$!next_substitution = $substitution;
	}
	}

	multi submethod triage_substitution($_ where { .key ~~ Regex }) {
	my $key = .key;
	return unless $!source.substr($!index) ~~ $key;
	self.compare_substitution($_, $!index + $/.from, $/.to - $/.from);
	}

	multi submethod triage_substitution($_ where { .key ~~ Cool }) {
	return unless defined index($!source, .key, $!index);
	self.compare_substitution($_,
	index($!source, .key, $!index),
	.key.chars);
	}

	multi submethod triage_substitution($_) {
	die "Don't know how to handle a {.WHAT} as a substitution key";
	}

	multi submethod increment_index(Regex $s) {
	$!source.substr($!index) ~~ $s;
	$!index = $!next_match + $/.chars;
	}

	multi submethod increment_index(Str $s) {
	$!index = $!next_match + $s.chars;
	}

	method next_substitution() {
	$!next_match = $!source.chars;

	for @!substitutions {
	self.triage_substitution($_);
	}

	$!unsubstituted_text
	= $!source.substr($!index, $!next_match - $!index);
	if defined $!next_substitution {
	my $result = $!next_substitution.value;
	$!substituted_text = $result ~~ Callable ?? $result() !! $result;
	self.increment_index($!next_substitution.key);
	}

	return $!next_match < $!source.chars;
	}
	}

	augment class Str {

	multi method xtrans(*@changes) {
	my sub expand($s) {
	return $s.list if $s ~~ Iterable\|Positional;
	gather for $s.comb(/ (\w) '..' (\w) \| . /, :match) {
	if .[0] {
	take $_ for ~.[0] .. ~.[1];
	} else {
	take ~$_;
	}
	}
	}

	my $ltm = LongestSubstitutionMatcher.new(:source(self));
	for (@changes) -> $p {
	die "$p.perl() is not a Pair" unless $p ~~ Pair;
	if $p.key ~~ Regex {
	$ltm.add_substitution($p.key, $p.value);
	}
	elsif $p.value ~~ Callable {
	my @from = expand $p.key;
	for @from -> $f {
	$ltm.add_substitution($f, $p.value);
	}
	}
	else {
	my @from = expand $p.key;
	my @to = expand $p.value;
	if @to {
	@to = @to xx ceiling(@from / @to);
	} else {
	@to = '' xx @from;
	}
	for @from Z @to -> $f, $t {
	# XXX Should add contingency for key already existing
	# XXX Is there a test for that?
	$ltm.add_substitution($f, $t);
	}
	}
	}

	my $r = "";
	while $ltm.next_substitution {
	$r ~= $ltm.unsubstituted_text ~ $ltm.substituted_text;
	}
	$r ~= $ltm.unsubstituted_text;

	return $r;
	}
	}

	use Test;

	plan 37;

	is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ),
	"abc",
	"Each side can be individual characters");

	is("XYZ".xtrans( ('XYZ' => 'xyz') ),
	"xyz",
	"The two sides of the any pair can be strings interpreted as tr/// would multichar");

	is("ABC".xtrans( ('A..C' => 'a..c') ),
	"abc",
	"The two sides of the any pair can be strings interpreted as tr/// would range");

	is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")),
	"abc_def",
	"If the first character is a dash it isn't part of a range");

	is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")),
	"abc_def",
	"If the last character is a dash it isn't part of a range");

	is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ),
	"abcdeF",
	"The two sides can consists of both chars and ranges");

	is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ),
	"abcDefgH",
	"The two sides can consist of multiple ranges");

	is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ),
	"abcxyz",
	"The two sides of each pair may also be array references" );

	is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE",
	"Using string range on one side and array reference on the other");

	is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde",
	"Using array reference on one side and string range on the other");

	is(" <>&".xtrans( ([' ', '<', '>', '&'] =>
	[' ', '<', '>', '&' ])),
	" <>&","The array version can map one characters to one-or-more characters");

	is(" <>&".xtrans( ([' ', '<', '>', '&' ] =>
	[' ', '<', '>', '&' ])),
	" <>&",
	"The array version can map one-or-more characters to one-or-more characters");

	is(" <>&".xtrans( ([' ', ' <', '<', '>', '&'] =>
	[' ', 'AB', '<', '>', '&' ])),
	"AB>&",
	"The array version can map one characters to one-or-more characters, using leftmost longest match");

	is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']),
	"Just another Perl hacker",
	"Ranges can be grouped");

	is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'),
	"Just another Perl hacker",
	"Multiple ranges interpreted in string");

	# Per S05 changes
	{
	is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'),
	"Just_another_Perl_hacker",
	"Spaces in interpreted ranges are not skipped (all spaces are important)");
	is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'),
	"Whfg nnbgure Crey unpxre",
	"Spaces in interpreted ranges are not skipped (all spaces are important)");
	};

	my $a = "abcdefghijklmnopqrstuvwxyz";

	my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

	is($a.xtrans('a..z' => 'A..Z'), $b);

	is($b.xtrans('A..Z' => 'a..z'), $a);

	is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz');

	is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj");

	is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190");

	is($b.xtrans('..H..Z' => '__h..z'),
	'ABCDEFGhijklmnopqrstuvwxyz',
	'leading ranges interpreted as string');

	is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
	'trailing ranges interpreted as string');

	is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
	'leading, trailing ranges interpreted as string');

	# added as a consequence of RT #76720
	is("hello".xtrans("l" => ""), "heo", "can replace with empty string");

	# complement, squeeze/squash, delete

	is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx',
	'no flags');

	is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty',
	'no flags');

	{
	# remove vowel and character after
	is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works');
	is( # vowels become 'y' and whitespace becomes '_'
	"ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'),
	'yb_cd_yf_gh',
	'regexes pairs work',
	);

	my $i = 0;
	is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure');

	$i = 0;
	my $j = 0;
	is(
	'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}),
	'a0b0c1d1e2f',
	'closure pairs work',
	);
	};

	# RT #71088
	{
	lives_ok { "".subst(/x/, "").xtrans() },
	'trans on subst output lives';
	}

	is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']),
	'1243',
	'longest constant token preferred, regardless of declaration order');

	is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct',
	'longest regex token preferred, regardless of declaration order');

	is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111',
	'in case of a tie between regex lengths, prefer the first one');

	is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] =>
	['1', '2', '3', '4' ]),
	'23',
	'longest token still holds, even between constant strings and regexes');

	# vim: ft=perl6