masak/trans-fix

## trans-fix
use v6;
use MONKEY_TYPING;

augment class Str {

    my class LSM {
        has Cool $!source is readonly;
        has      @!substitutions;

        has Int  $!index = 0;
        has Int  $!next_match;
        has      $!next_substitution;
        has      $!substitution_length;

        has Str  $.unsubstituted_text;
        has Str  $.substituted_text;

        method add_substitution($key, $value) {
            push @!substitutions, $key => $value;
        }

        submethod compare_substitution($substitution, Int $pos, Int $length) {
            if $!next_match > $pos
               || $!next_match == $pos && $!substitution_length < $length {

                $!next_match = $pos;
                $!substitution_length = $length;
                $!next_substitution = $substitution;
            }
        }

        multi submethod triage_substitution($_ where { .key ~~ Regex }) {
            my $key = .key;
            return unless $!source.substr($!index) ~~ $key;
            self.compare_substitution($_, $!index + $/.from, $/.to - $/.from);
        }

        multi submethod triage_substitution($_ where { .key ~~ Cool }) {
            return unless defined index($!source, .key, $!index);
            self.compare_substitution($_,
                                      index($!source, .key, $!index),
                                      .key.chars);
        }

        multi submethod triage_substitution($_) {
            die "Don't know how to handle a {.WHAT} as a substitution key";
        }

        multi submethod increment_index(Regex $s) {
            $!source.substr($!index) ~~ $s;
            $!index = $!next_match + $/.chars;
        }

        multi submethod increment_index(Str $s) {
            $!index = $!next_match + $s.chars;
        }

        method next_substitution() {
            $!next_match = $!source.chars;

            for @!substitutions {
                self.triage_substitution($_);
            }

            $!unsubstituted_text
                = $!source.substr($!index, $!next_match - $!index);
            if defined $!next_substitution {
                my $result = $!next_substitution.value;
                $!substituted_text
                    = $result ~~ Callable ?? $result() !! $result;
                self.increment_index($!next_substitution.key);
            }

            return $!next_match < $!source.chars;
        }
    }

    multi method xtrans(*@changes) {
        my sub expand($s) {
            return $s.list if $s ~~ Iterable|Positional;
            gather for $s.comb(/ (\w) '..' (\w) | . /, :match) {
                if .[0] {
                    take $_ for ~.[0] .. ~.[1];
                } else {
                    take ~$_;
                }
            }
        }

        my $lsm = LSM.new(:source(self));
        for (@changes) -> $p {
            die "$p.perl() is not a Pair" unless $p ~~ Pair;
            if $p.key ~~ Regex {
                $lsm.add_substitution($p.key, $p.value);
            }
            elsif $p.value ~~ Callable {
                my @from = expand $p.key;
                for @from -> $f {
                    $lsm.add_substitution($f, $p.value);
                }
            }
            else {
                my @from = expand $p.key;
                my @to   = expand $p.value;
                if @to {
                    @to = @to xx ceiling(@from / @to);
                } else {
                    @to = '' xx @from;
                }
                for @from Z @to -> $f, $t {
                    $lsm.add_substitution($f, $t);
                }
            }
        }

        my $r = "";
        while $lsm.next_substitution {
            $r ~= $lsm.unsubstituted_text ~ $lsm.substituted_text;
        }
        $r ~= $lsm.unsubstituted_text;

        return $r;
    }
}

use Test;

# L<S05/Transliteration>

if 0 {
plan 57;

is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ),
    "abc",
    "Each side can be individual characters");

is("XYZ".xtrans( ('XYZ' => 'xyz') ),
    "xyz",
    "The two sides of the any pair can be strings interpreted as tr/// would multichar");

is("ABC".xtrans( ('A..C' => 'a..c') ),
    "abc",
    "The two sides of the any pair can be strings interpreted as tr/// would range");

is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")),
    "abc_def",
    "If the first character is a dash it isn't part of a range");

is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")),
    "abc_def",
    "If the last character is a dash it isn't part of a range");

is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ),
    "abcdeF",
    "The two sides can consists of both chars and ranges");

is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ),
    "abcDefgH",
    "The two sides can consist of multiple ranges");

is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ),
    "abcxyz",
    "The two sides of each pair may also be array references" );

is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE",
	   "Using string range on one side and array reference on the other");

is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde",
	   "Using array reference on one side and string range on the other");

is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&lt;', '&gt;', '&amp;'] =>
    [' ',      '<',    '>',    '&'     ])),
    " <>&","The array version can map one characters to one-or-more characters");

is(" <>&".xtrans( ([' ',      '<',    '>',    '&'    ] =>
                  ['&nbsp;', '&lt;', '&gt;', '&amp;' ])),
                  "&nbsp;&lt;&gt;&amp;",
    "The array version can map one-or-more characters to one-or-more characters");

is("&nbsp;&lt;&gt;&amp;".xtrans( (['&nbsp;', '&nbsp;&lt;', '&lt;', '&gt;', '&amp;'] =>
                                 [' ',      'AB',         '<',    '>',    '&'    ])),
                                "AB>&",
    "The array version can map one characters to one-or-more characters, using leftmost longest match");

is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']),
    "Just another Perl hacker",
    "Ranges can be grouped");

is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'),
    "Just another Perl hacker",
    "Multiple ranges interpreted in string");

# Per S05 changes
{
is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'),
    "Just_another_Perl_hacker",
    "Spaces in interpreted ranges are not skipped (all spaces are important)");
is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'),
    "Whfg nnbgure Crey unpxre",
    "Spaces in interpreted ranges are not skipped (all spaces are important)");
};

my $a = "abcdefghijklmnopqrstuvwxyz";

my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

is($a.xtrans('a..z' => 'A..Z'), $b);

is($b.xtrans('A..Z' => 'a..z'), $a);

is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz');

is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj");

is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190");

is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
    'trailing ranges interpreted as string');

is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
    'leading, trailing ranges interpreted as string');

# added as a consequence of RT #76720
is("hello".xtrans("l" => ""), "heo", "can replace with empty string");

# complement, squeeze/squash, delete

is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx',
    'no flags');

is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty',
    'no flags');

{
# remove vowel and character after
    is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works');
    is( # vowels become 'y' and whitespace becomes '_'
        "ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'),
        'yb_cd_yf_gh',
        'regexes pairs work',
    );

    my $i = 0;
    is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure');

    $i = 0;
    my $j = 0;
    is(
        'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}),
        'a0b0c1d1e2f',
        'closure pairs work',
    );
};

# y/// is dead
eval_dies_ok('$_ = "axbycz"; y/abc/def/', 'y/// does not exist any longer');

# RT #71088
{
    lives_ok { "".subst(/x/, "").xtrans() },
        'trans on subst output lives';
}

is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']),
   '1243',
   'longest constant token preferred, regardless of declaration order');

is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct',
   'longest regex token preferred, regardless of declaration order');

is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111',
   'in case of a tie between regex lengths, prefer the first one');

is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] =>
                   ['1',  '2',   '3',   '4'   ]),
   '23',
   'longest token still holds, even between constant strings and regexes');
}

# RT #79778
is("bare\ncode  is    lovely".xtrans( / \s+ / => ' ' ), 'bare code is lovely',
   'pair with one regex and one literal string yields right result');

# RT #79778
is("bare\ncode  is    lovely".trans( / \s+ / => ' ' ), 'bare code is lovely',
   'pair with one regex and one literal string yields right result');

# vim: ft=perl6
	use v6;
	use MONKEY_TYPING;

	augment class Str {

	my class LSM {
	has Cool $!source is readonly;
	has @!substitutions;

	has Int $!index = 0;
	has Int $!next_match;
	has $!next_substitution;
	has $!substitution_length;

	has Str $.unsubstituted_text;
	has Str $.substituted_text;

	method add_substitution($key, $value) {
	push @!substitutions, $key => $value;
	}

	submethod compare_substitution($substitution, Int $pos, Int $length) {
	if $!next_match > $pos
	\|\| $!next_match == $pos && $!substitution_length < $length {

	$!next_match = $pos;
	$!substitution_length = $length;
	$!next_substitution = $substitution;
	}
	}

	multi submethod triage_substitution($_ where { .key ~~ Regex }) {
	my $key = .key;
	return unless $!source.substr($!index) ~~ $key;
	self.compare_substitution($_, $!index + $/.from, $/.to - $/.from);
	}

	multi submethod triage_substitution($_ where { .key ~~ Cool }) {
	return unless defined index($!source, .key, $!index);
	self.compare_substitution($_,
	index($!source, .key, $!index),
	.key.chars);
	}

	multi submethod triage_substitution($_) {
	die "Don't know how to handle a {.WHAT} as a substitution key";
	}

	multi submethod increment_index(Regex $s) {
	$!source.substr($!index) ~~ $s;
	$!index = $!next_match + $/.chars;
	}

	multi submethod increment_index(Str $s) {
	$!index = $!next_match + $s.chars;
	}

	method next_substitution() {
	$!next_match = $!source.chars;

	for @!substitutions {
	self.triage_substitution($_);
	}

	$!unsubstituted_text
	= $!source.substr($!index, $!next_match - $!index);
	if defined $!next_substitution {
	my $result = $!next_substitution.value;
	$!substituted_text
	= $result ~~ Callable ?? $result() !! $result;
	self.increment_index($!next_substitution.key);
	}

	return $!next_match < $!source.chars;
	}
	}

	multi method xtrans(*@changes) {
	my sub expand($s) {
	return $s.list if $s ~~ Iterable\|Positional;
	gather for $s.comb(/ (\w) '..' (\w) \| . /, :match) {
	if .[0] {
	take $_ for ~.[0] .. ~.[1];
	} else {
	take ~$_;
	}
	}
	}

	my $lsm = LSM.new(:source(self));
	for (@changes) -> $p {
	die "$p.perl() is not a Pair" unless $p ~~ Pair;
	if $p.key ~~ Regex {
	$lsm.add_substitution($p.key, $p.value);
	}
	elsif $p.value ~~ Callable {
	my @from = expand $p.key;
	for @from -> $f {
	$lsm.add_substitution($f, $p.value);
	}
	}
	else {
	my @from = expand $p.key;
	my @to = expand $p.value;
	if @to {
	@to = @to xx ceiling(@from / @to);
	} else {
	@to = '' xx @from;
	}
	for @from Z @to -> $f, $t {
	$lsm.add_substitution($f, $t);
	}
	}
	}

	my $r = "";
	while $lsm.next_substitution {
	$r ~= $lsm.unsubstituted_text ~ $lsm.substituted_text;
	}
	$r ~= $lsm.unsubstituted_text;

	return $r;
	}
	}

	use Test;

	# L<S05/Transliteration>

	if 0 {
	plan 57;

	is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ),
	"abc",
	"Each side can be individual characters");

	is("XYZ".xtrans( ('XYZ' => 'xyz') ),
	"xyz",
	"The two sides of the any pair can be strings interpreted as tr/// would multichar");

	is("ABC".xtrans( ('A..C' => 'a..c') ),
	"abc",
	"The two sides of the any pair can be strings interpreted as tr/// would range");

	is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")),
	"abc_def",
	"If the first character is a dash it isn't part of a range");

	is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")),
	"abc_def",
	"If the last character is a dash it isn't part of a range");

	is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ),
	"abcdeF",
	"The two sides can consists of both chars and ranges");

	is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ),
	"abcDefgH",
	"The two sides can consist of multiple ranges");

	is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ),
	"abcxyz",
	"The two sides of each pair may also be array references" );

	is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE",
	"Using string range on one side and array reference on the other");

	is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde",
	"Using array reference on one side and string range on the other");

	is(" <>&".xtrans( ([' ', '<', '>', '&'] =>
	[' ', '<', '>', '&' ])),
	" <>&","The array version can map one characters to one-or-more characters");

	is(" <>&".xtrans( ([' ', '<', '>', '&' ] =>
	[' ', '<', '>', '&' ])),
	" <>&",
	"The array version can map one-or-more characters to one-or-more characters");

	is(" <>&".xtrans( ([' ', ' <', '<', '>', '&'] =>
	[' ', 'AB', '<', '>', '&' ])),
	"AB>&",
	"The array version can map one characters to one-or-more characters, using leftmost longest match");

	is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']),
	"Just another Perl hacker",
	"Ranges can be grouped");

	is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'),
	"Just another Perl hacker",
	"Multiple ranges interpreted in string");

	# Per S05 changes
	{
	is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'),
	"Just_another_Perl_hacker",
	"Spaces in interpreted ranges are not skipped (all spaces are important)");
	is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'),
	"Whfg nnbgure Crey unpxre",
	"Spaces in interpreted ranges are not skipped (all spaces are important)");
	};

	my $a = "abcdefghijklmnopqrstuvwxyz";

	my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

	is($a.xtrans('a..z' => 'A..Z'), $b);

	is($b.xtrans('A..Z' => 'a..z'), $a);

	is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz');

	is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj");

	is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190");

	is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
	'trailing ranges interpreted as string');

	is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ',
	'leading, trailing ranges interpreted as string');

	# added as a consequence of RT #76720
	is("hello".xtrans("l" => ""), "heo", "can replace with empty string");

	# complement, squeeze/squash, delete

	is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx',
	'no flags');

	is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty',
	'no flags');

	{
	# remove vowel and character after
	is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works');
	is( # vowels become 'y' and whitespace becomes '_'
	"ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'),
	'yb_cd_yf_gh',
	'regexes pairs work',
	);

	my $i = 0;
	is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure');

	$i = 0;
	my $j = 0;
	is(
	'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}),
	'a0b0c1d1e2f',
	'closure pairs work',
	);
	};

	# y/// is dead
	eval_dies_ok('$_ = "axbycz"; y/abc/def/', 'y/// does not exist any longer');

	# RT #71088
	{
	lives_ok { "".subst(/x/, "").xtrans() },
	'trans on subst output lives';
	}

	is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']),
	'1243',
	'longest constant token preferred, regardless of declaration order');

	is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct',
	'longest regex token preferred, regardless of declaration order');

	is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111',
	'in case of a tie between regex lengths, prefer the first one');

	is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] =>
	['1', '2', '3', '4' ]),
	'23',
	'longest token still holds, even between constant strings and regexes');
	}

	# RT #79778
	is("bare\ncode is lovely".xtrans( / \s+ / => ' ' ), 'bare code is lovely',
	'pair with one regex and one literal string yields right result');

	# RT #79778
	is("bare\ncode is lovely".trans( / \s+ / => ' ' ), 'bare code is lovely',
	'pair with one regex and one literal string yields right result');

	# vim: ft=perl6