Created
November 26, 2010 09:57
-
-
Save masak/716485 to your computer and use it in GitHub Desktop.
disturbing result: .trans works differently in and out of Rakudo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
use MONKEY_TYPING; | |
augment class Str { | |
my class LSM { | |
has Cool $!source is readonly; | |
has @!substitutions; | |
has Int $!index = 0; | |
has Int $!next_match; | |
has $!next_substitution; | |
has $!substitution_length; | |
has Str $.unsubstituted_text; | |
has Str $.substituted_text; | |
method add_substitution($key, $value) { | |
push @!substitutions, $key => $value; | |
} | |
submethod compare_substitution($substitution, Int $pos, Int $length) { | |
if $!next_match > $pos | |
|| $!next_match == $pos && $!substitution_length < $length { | |
$!next_match = $pos; | |
$!substitution_length = $length; | |
$!next_substitution = $substitution; | |
} | |
} | |
multi submethod triage_substitution($_ where { .key ~~ Regex }) { | |
my $key = .key; | |
return unless $!source.substr($!index) ~~ $key; | |
self.compare_substitution($_, $!index + $/.from, $/.to - $/.from); | |
} | |
multi submethod triage_substitution($_ where { .key ~~ Cool }) { | |
return unless defined index($!source, .key, $!index); | |
self.compare_substitution($_, | |
index($!source, .key, $!index), | |
.key.chars); | |
} | |
multi submethod triage_substitution($_) { | |
die "Don't know how to handle a {.WHAT} as a substitution key"; | |
} | |
multi submethod increment_index(Regex $s) { | |
$!source.substr($!index) ~~ $s; | |
$!index = $!next_match + $/.chars; | |
} | |
multi submethod increment_index(Str $s) { | |
$!index = $!next_match + $s.chars; | |
} | |
method next_substitution() { | |
$!next_match = $!source.chars; | |
for @!substitutions { | |
self.triage_substitution($_); | |
} | |
$!unsubstituted_text | |
= $!source.substr($!index, $!next_match - $!index); | |
if defined $!next_substitution { | |
my $result = $!next_substitution.value; | |
$!substituted_text | |
= $result ~~ Callable ?? $result() !! $result; | |
self.increment_index($!next_substitution.key); | |
} | |
return $!next_match < $!source.chars; | |
} | |
} | |
multi method xtrans(*@changes) { | |
my sub expand($s) { | |
return $s.list if $s ~~ Iterable|Positional; | |
gather for $s.comb(/ (\w) '..' (\w) | . /, :match) { | |
if .[0] { | |
take $_ for ~.[0] .. ~.[1]; | |
} else { | |
take ~$_; | |
} | |
} | |
} | |
my $lsm = LSM.new(:source(self)); | |
for (@changes) -> $p { | |
die "$p.perl() is not a Pair" unless $p ~~ Pair; | |
if $p.key ~~ Regex { | |
$lsm.add_substitution($p.key, $p.value); | |
} | |
elsif $p.value ~~ Callable { | |
my @from = expand $p.key; | |
for @from -> $f { | |
$lsm.add_substitution($f, $p.value); | |
} | |
} | |
else { | |
my @from = expand $p.key; | |
my @to = expand $p.value; | |
if @to { | |
@to = @to xx ceiling(@from / @to); | |
} else { | |
@to = '' xx @from; | |
} | |
for @from Z @to -> $f, $t { | |
$lsm.add_substitution($f, $t); | |
} | |
} | |
} | |
my $r = ""; | |
while $lsm.next_substitution { | |
$r ~= $lsm.unsubstituted_text ~ $lsm.substituted_text; | |
} | |
$r ~= $lsm.unsubstituted_text; | |
return $r; | |
} | |
} | |
use Test; | |
# L<S05/Transliteration> | |
if 0 { | |
plan 57; | |
is("ABC".xtrans( ('A'=>'a'), ('B'=>'b'), ('C'=>'c') ), | |
"abc", | |
"Each side can be individual characters"); | |
is("XYZ".xtrans( ('XYZ' => 'xyz') ), | |
"xyz", | |
"The two sides of the any pair can be strings interpreted as tr/// would multichar"); | |
is("ABC".xtrans( ('A..C' => 'a..c') ), | |
"abc", | |
"The two sides of the any pair can be strings interpreted as tr/// would range"); | |
is("ABC-DEF".xtrans(("- AB..Z" => "_ a..z")), | |
"abc_def", | |
"If the first character is a dash it isn't part of a range"); | |
is("ABC-DEF".xtrans(("A..YZ-" => "a..z_")), | |
"abc_def", | |
"If the last character is a dash it isn't part of a range"); | |
is("ABCDEF".xtrans( ('AB..E' => 'ab..e') ), | |
"abcdeF", | |
"The two sides can consists of both chars and ranges"); | |
is("ABCDEFGH".xtrans( ('A..CE..G' => 'a..ce..g') ), | |
"abcDefgH", | |
"The two sides can consist of multiple ranges"); | |
is("ABCXYZ".xtrans( (['A'..'C'] => ['a'..'c']), (<X Y Z> => <x y z>) ), | |
"abcxyz", | |
"The two sides of each pair may also be array references" ); | |
is("abcde".xtrans( ('a..e' => 'A'..'E') ), "ABCDE", | |
"Using string range on one side and array reference on the other"); | |
is("ABCDE".xtrans( (['A' .. 'E'] => "a..e") ), "abcde", | |
"Using array reference on one side and string range on the other"); | |
is(" <>&".xtrans( ([' ', '<', '>', '&'] => | |
[' ', '<', '>', '&' ])), | |
" <>&","The array version can map one characters to one-or-more characters"); | |
is(" <>&".xtrans( ([' ', '<', '>', '&' ] => | |
[' ', '<', '>', '&' ])), | |
" <>&", | |
"The array version can map one-or-more characters to one-or-more characters"); | |
is(" <>&".xtrans( ([' ', ' <', '<', '>', '&'] => | |
[' ', 'AB', '<', '>', '&' ])), | |
"AB>&", | |
"The array version can map one characters to one-or-more characters, using leftmost longest match"); | |
is("Whfg nabgure Crey unpxre".xtrans('a'..'z' => ['n'..'z','a'..'m'], 'A'..'Z' => ['N'..'Z','A'..'M']), | |
"Just another Perl hacker", | |
"Ranges can be grouped"); | |
is("Whfg nabgure Crey unpxre".xtrans('a..z' => 'n..za..m', 'A..Z' => 'N..ZA..M'), | |
"Just another Perl hacker", | |
"Multiple ranges interpreted in string"); | |
# Per S05 changes | |
{ | |
is("Whfg nabgure Crey unpxre".xtrans(' a..z' => '_n..za..m', 'A..Z' => 'N..ZA..M'), | |
"Just_another_Perl_hacker", | |
"Spaces in interpreted ranges are not skipped (all spaces are important)"); | |
is("Whfg nabgure Crey unpxre".xtrans('a .. z' => 'n .. za .. m', 'A .. Z' => 'N .. ZA .. M'), | |
"Whfg nnbgure Crey unpxre", | |
"Spaces in interpreted ranges are not skipped (all spaces are important)"); | |
}; | |
my $a = "abcdefghijklmnopqrstuvwxyz"; | |
my $b = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; | |
is($a.xtrans('a..z' => 'A..Z'), $b); | |
is($b.xtrans('A..Z' => 'a..z'), $a); | |
is($a.xtrans('b..y' => 'B..Y'), 'aBCDEFGHIJKLMNOPQRSTUVWXYz'); | |
is("I\xcaJ".xtrans('I..J' => 'i..j'), "i\xcaj"); | |
is("\x12c\x190".xtrans("\x12c" => "\x190"), "\x190\x190"); | |
is($b.xtrans('A..H..' => 'a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ', | |
'trailing ranges interpreted as string'); | |
is($b.xtrans('..A..H..' => '__a..h__'), 'abcdefghIJKLMNOPQRSTUVWXYZ', | |
'leading, trailing ranges interpreted as string'); | |
# added as a consequence of RT #76720 | |
is("hello".xtrans("l" => ""), "heo", "can replace with empty string"); | |
# complement, squeeze/squash, delete | |
is('ABC123DEF456GHI'.xtrans('A..Z' => 'x'), 'xxx123xxx456xxx', | |
'no flags'); | |
is('Good&Plenty'.xtrans('len' => 'x'), 'Good&Pxxxty', | |
'no flags'); | |
{ | |
# remove vowel and character after | |
is('abcdefghij'.xtrans(/<[aeiou]> \w/ => ''), 'cdgh', 'basic regex works'); | |
is( # vowels become 'y' and whitespace becomes '_' | |
"ab\ncd\tef gh".xtrans(/<[aeiou]>/ => 'y', /\s/ => '_'), | |
'yb_cd_yf_gh', | |
'regexes pairs work', | |
); | |
my $i = 0; | |
is('ab_cd_ef_gh'.xtrans('_' => {$i++}), 'ab0cd1ef2gh', 'basic closure'); | |
$i = 0; | |
my $j = 0; | |
is( | |
'a_b/c_d/e_f'.xtrans('_' => {$i++}, '/' => {$j++}), | |
'a0b0c1d1e2f', | |
'closure pairs work', | |
); | |
}; | |
# y/// is dead | |
eval_dies_ok('$_ = "axbycz"; y/abc/def/', 'y/// does not exist any longer'); | |
# RT #71088 | |
{ | |
lives_ok { "".subst(/x/, "").xtrans() }, | |
'trans on subst output lives'; | |
} | |
is('aaaaabbbbb'.xtrans(['aaa', 'aa', 'bb', 'bbb'] => ['1', '2', '3', '4']), | |
'1243', | |
'longest constant token preferred, regardless of declaration order'); | |
is('foobar'.xtrans(/\w+/ => 'correct', /foo/ => 'RONG'), 'correct', | |
'longest regex token preferred, regardless of declaration order'); | |
is('aaaa'.xtrans(/a/ => '1', /\w/ => '2', /./ => '3'), '1111', | |
'in case of a tie between regex lengths, prefer the first one'); | |
is('ababab'.xtrans([/ab/, 'aba', 'bab', /baba/] => | |
['1', '2', '3', '4' ]), | |
'23', | |
'longest token still holds, even between constant strings and regexes'); | |
} | |
# RT #79778 | |
is("bare\ncode is lovely".xtrans( / \s+ / => ' ' ), 'bare code is lovely', | |
'pair with one regex and one literal string yields right result'); | |
# RT #79778 | |
is("bare\ncode is lovely".trans( / \s+ / => ' ' ), 'bare code is lovely', | |
'pair with one regex and one literal string yields right result'); | |
# vim: ft=perl6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment