Skip to content

Instantly share code, notes, and snippets.

@hanabokuro
Last active December 22, 2015 03:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hanabokuro/6410769 to your computer and use it in GitHub Desktop.
Save hanabokuro/6410769 to your computer and use it in GitHub Desktop.
package AvoidCoreTextBug;
use strict;
use warnings;

# Inserts U+200E LEFT-TO-RIGHT MARK after every "right-to-left character,
# optional ASCII punctuation, separator" sequence found in a string.
# NOTE(review): judging by the package name this works around a CoreText
# rendering bug triggered by such sequences -- confirm against the original
# bug report.
#
# Public interface:
#   %SEPARATORS             - separator code points accepted as numeric entities
#   filter($str, $flag)     - returns the filtered copy of $str
#   add_lrm(...)            - replacement callback used by filter()

# Code points that count as a separator when they are written as a numeric
# character reference (e.g. "&#32;" or "&#xa0;").  Keys are the decimal code
# point numbers; add_lrm() looks the decoded entity value up here.
our %SEPARATORS = map { $_ => 1 } (
    0x20,      # SPACE
    0xa0,      # NO-BREAK SPACE
    0x1680,    # OGHAM SPACE MARK
    0x180e,    # MONGOLIAN VOWEL SEPARATOR
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
    0x2006, 0x2007, 0x2008, 0x2009, 0x200a,    # EN QUAD .. HAIR SPACE
    0x202f,    # NARROW NO-BREAK SPACE
    0x205f,    # MEDIUM MATHEMATICAL SPACE
    0x3000,    # IDEOGRAPHIC SPACE
    0xad,      # SOFT HYPHEN
);

# Inclusive code point ranges treated as right-to-left text.  This single table
# feeds both the character class used for literal characters and the numeric
# check used for "&#...;" entities, so the two can never disagree.
#
# http://en.wikipedia.org/wiki/Arabic_script_in_Unicode
# http://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet
# http://www.fileformat.info/info/unicode/block/syriac/index.htm
my @RTL_RANGES = (
    [ 0x0600,  0x06ff ],     # Arabic
    [ 0x0750,  0x077f ],     # Arabic Supplement
    [ 0x08a0,  0x08ff ],     # Arabic Extended-A
    [ 0xfb50,  0xfdff ],     # Arabic Presentation Forms-A
    # Arabic Presentation Forms-B.  The lower bound used to be 0x7e70, a typo
    # that made most CJK ideographs, Hangul, etc. count as right-to-left.
    [ 0xfe70,  0xfeff ],
    [ 0x10e60, 0x10e7f ],    # Rumi Numeral Symbols
    [ 0x1ee00, 0x1eeff ],    # Arabic Mathematical Alphabetic Symbols
    [ 0x0590,  0x05ff ],     # Hebrew
    [ 0xfb1d,  0xfb4f ],     # Hebrew: Alphabetic Presentation Forms
    [ 0x0700,  0x074f ],     # Syriac
    [ 0x200f,  0x200f ],     # RIGHT-TO-LEFT MARK      (rlm)
    [ 0x202b,  0x202b ],     # RIGHT-TO-LEFT EMBEDDING (rle)
    [ 0x202e,  0x202e ],     # RIGHT-TO-LEFT OVERRIDE  (rlo)
);

# Body of a bracketed character class built from @RTL_RANGES, for example
# "\x{600}-\x{6ff}\x{750}-\x{77f}...".  It must not contain whitespace because
# whitespace inside a bracketed class is significant even under /x.
my $RTL_CHAR_CLASS = q{};
for my $range (@RTL_RANGES) {
    my ( $first, $last ) = @{$range};
    $RTL_CHAR_CLASS .= $first == $last
        ? sprintf( '\x{%x}', $first )
        : sprintf( '\x{%x}-\x{%x}', $first, $last );
}

# Pattern matched by filter().  Capture groups, in order:
#   1  the whole match
#   2  digits of a numeric entity in the right-to-left position ("x5d0", "1488")
#   3  the glue: a minimal run of ASCII controls, space and punctuation
#   4  the separator (named entity, numeric entity or literal character)
#   5  digits of a numeric entity in the separator position
# Numeric entities are matched syntactically only; add_lrm() decides whether
# their value really is right-to-left text or a separator.
my $BOUNDARY_RE = qr{
    (
        (?:
            [$RTL_CHAR_CLASS]
            |
            &(?:rlm|rle|rlo);?
            |
            &\#(x[0-9a-f]+|[0-9]+);?
        )
        ([\x{00}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{7f}]*?)
        (
            &(?:nbsp|shy);?
            |
            &\#(x[0-9a-f]+|[0-9]+);?
            |
            [\x{00}-\x{20}\x{7f}\x{ad}\p{Separator}]
        )
    )
}xi;

# filter($str, $dont_convert_entity_reference)
#
# Returns a copy of $str with U+200E appended after every match of
# $BOUNDARY_RE that add_lrm() accepts.
#
#   $str  - character (decoded) string; undef is returned unchanged.
#   $dont_convert_entity_reference
#         - when true, matches whose right-to-left part or separator is
#           written as an entity reference ("&...") are left untouched.
sub filter {
    my ( $str, $dont_convert_entity_reference ) = @_;
    return $str unless defined $str;

    $str =~ s/$BOUNDARY_RE/add_lrm($dont_convert_entity_reference, $1, $2, $3, $4, $5)/eg;
    return $str;
}

# add_lrm($dont_convert_entity_reference, $original, $code, $glue,
#         $separator, $separator_code)
#
# Replacement callback for filter(); the arguments after the flag are capture
# groups 1-5 of $BOUNDARY_RE.  Returns $original followed by U+200E when the
# match qualifies, otherwise $original unchanged.
#
#   $original       - the whole matched text
#   $code           - digits of the leading numeric entity, if there is one
#   $glue           - punctuation between the two parts (not used here; kept
#                     so the argument list mirrors the capture groups)
#   $separator      - the matched separator text
#   $separator_code - digits of the separator numeric entity, if there is one
sub add_lrm {
    my ( $dont_convert_entity_reference, $original, $code, $glue, $separator,
        $separator_code )
        = @_;

    if ($dont_convert_entity_reference) {
        return $original if $original =~ /^&/ || $separator =~ /^&/;
    }

    # A leading numeric entity only counts when it encodes right-to-left text.
    if ( $original =~ /^&#/ ) {
        return $original
            unless _is_rtl_code_point( _entity_code_point($code) );
    }

    # A numeric-entity separator only counts when it is listed in %SEPARATORS.
    if ( $separator =~ /^&#/ ) {
        return $original
            unless $SEPARATORS{ _entity_code_point($separator_code) };
    }

    return $original . "\x{200e}";
}

# Converts the digits of a numeric character reference ("x5D0" or "1488")
# into the code point number.
sub _entity_code_point {
    my ($digits) = @_;
    return $digits =~ /^x(.*)/i ? hex($1) : $digits + 0;
}

# True when $code_point falls inside one of the @RTL_RANGES entries.
sub _is_rtl_code_point {
    my ($code_point) = @_;
    for my $range (@RTL_RANGES) {
        return 1
            if $range->[0] <= $code_point && $code_point <= $range->[1];
    }
    return 0;
}

1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment