Created
November 4, 2015 18:47
-
-
Save daviddelikat/b8af8e843eb4bddf5f6e to your computer and use it in GitHub Desktop.
webgui url utf8 patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/lib/WebGUI/Session/Url.pm b/lib/WebGUI/Session/Url.pm | |
index 599d0e8..272d283 100644 | |
--- a/lib/WebGUI/Session/Url.pm | |
+++ b/lib/WebGUI/Session/Url.pm | |
@@ -22,6 +22,7 @@ use Scalar::Util qw( weaken ); | |
use WebGUI::International; | |
use WebGUI::Utility; | |
use Encode; | |
+use WebGUI::Text::Utf8; | |
=head1 NAME | |
@@ -402,10 +403,14 @@ The string to make compliant. This is usually a page title or a filename. | |
sub makeCompliant { | |
my $self = shift; | |
my $url = shift; | |
+ if( WebGUI::Text::Utf8::has_nonAsciiChars($url) ) { | |
+ $url = WebGUI::Text::Utf8::convertToAscii( $url ); | |
+ } | |
+ $url =~ s/_html\d?//g; | |
$url =~ s/\s+$//; #removes trailing whitespace | |
$url =~ s/^\s+//; #removes leading whitespace | |
$url =~ s/^\\//; #removes leading slash | |
- $url =~ s/ /-/g; #replaces whitespace with dashes | |
+ $url =~ s/\s+/-/g; #replaces whitespace with dashes | |
$url =~ s/\.$//; #removes trailing period | |
$url =~ s/[^A-Za-z0-9\-\.\_\/]//g; #removes all funky characters | |
$url =~ s/^\///; #removes a preceeding / | |
diff --git a/local/lib/WebGUI/Text/Utf8.pm b/local/lib/WebGUI/Text/Utf8.pm | |
new file mode 100644 | |
index 0000000..c72f952 | |
--- /dev/null | |
+++ b/local/lib/WebGUI/Text/Utf8.pm | |
@@ -0,0 +1,89 @@ | |
+ | |
+package WebGUI::Text::Utf8; | |
+ | |
+# Modules that already failed to load.  Remembered so that a missing module | |
+# costs one require attempt (and one warning) per process. | |
+my %failedModules; | |
+ | |
+# $testLoadModule->($module) | |
+# | |
+# Makes sure $module (a package name such as 'Text::Unidecode') is loaded, | |
+# requiring it on first use. | |
+# | |
+# Returns 1 when the module is available, 0 when it could not be loaded. | |
+# Dies when $module is not a well-formed package name. | |
+my $testLoadModule = sub { | |
+    my $module = shift; | |
+    $module = '' unless defined $module; | |
+ | |
+    # identifier(::identifier)*, fully anchored and without nested | |
+    # quantifiers; rejects '' and a bare '::'. | |
+    die 'bad module name "', $module, '"' | |
+        unless $module =~ /\A\w+(?:::\w+)*\z/; | |
+    return 0 if exists $failedModules{$module}; | |
+ | |
+    # %INC is keyed by file path, not by package name.  It is the reliable | |
+    # "already loaded" test: a non-empty symbol table proves nothing, because | |
+    # merely compiling a call like Some::Module::func() creates an entry in | |
+    # that package's stash. | |
+    ( my $file = $module . '.pm' ) =~ s{::}{/}g; | |
+    return 1 if $INC{$file}; | |
+ | |
+    # Block eval with a path: no code is built from the module name. | |
+    return 1 if eval { require $file; 1 }; | |
+ | |
+    $failedModules{$module}++; | |
+    warn "failed to load $module\n"; | |
+    return 0; | |
+}; | |
+ | |
+# has_nonAsciiChars($string) | |
+# | |
+# Returns 1 when $string contains at least one character outside the ASCII | |
+# range, 0 otherwise (including for undef). | |
+sub has_nonAsciiChars { | |
+    my ($string) = @_; | |
+    return 0 unless defined $string; | |
+ | |
+    # A bare match returns an empty list on failure in list context, which | |
+    # silently drops an argument in calls like ok( has_nonAsciiChars($s), $name ). | |
+    # The ternary yields exactly one scalar in every context. | |
+    return $string =~ /[^[:ascii:]]/ ? 1 : 0; | |
+} | |
+ | |
+=pod | |
+ | |
+ @steps contains the bits of code that perform each attempt to transform the text. | |
+ | |
+ so... ideally, this would be split into perhaps 2 sets rather than three... | |
+ but, we have to handle every language, and I do not have time to sort out the most common | |
+ utf8 codes for the whole set of sites. | |
+ the BIG module at the end actually does a pretty good job of loading only what is required | |
+ a little testing could lead us to choose to use only that module... | |
+ but... I feel better about hitting a select list of characters that seem most likely to | |
+ occur on many sites and only load the BIG module for the sites where the whole language | |
+ is utf8... | |
+ in the medium term, I may be convinced to combine the first two modules because there is | |
+ already some overlap. | |
+ | |
+ another option would be to check the site language setting and choose a path based on that... | |
+ | |
+=cut | |
+ | |
+# Conversion attempts, in the order they are tried.  Each entry names the | |
+# module that has to be loadable ('module') and carries the code that passes | |
+# one string through that module and hands back the result ('run'). | |
+my @steps = ( | |
+    {   # this appears to be the smallest of the three | |
+        module => 'Text::StripAccents', | |
+        run    => sub { | |
+            my ($text) = @_; | |
+            return Text::StripAccents->strip($text); | |
+        }, | |
+    }, | |
+    { | |
+        module => 'WebGUI::Text::Utf8::Clean', | |
+        run    => sub { | |
+            my ($text) = @_; | |
+            return WebGUI::Text::Utf8::Clean->clean($text); | |
+        }, | |
+    }, | |
+    {   # this is definitely the largest | |
+        module => 'Text::Unidecode', | |
+        run    => sub { | |
+            my ($text) = @_; | |
+ | |
+            # The call below was observed to fail without this explicit | |
+            # require, although the other steps work without one. | |
+            # NOTE(review): presumably the fully qualified function call | |
+            # creates a symbol in the Text::Unidecode:: stash at compile | |
+            # time, so a namespace-existence probe reports the module as | |
+            # loaded before it really is -- confirm. | |
+            require Text::Unidecode; | |
+            return Text::Unidecode::unidecode($text); | |
+        }, | |
+    }, | |
+); | |
+ | |
+=pod | |
+ | |
+NOTE: | |
+ | |
+this module is coded with a view to small footprint. | |
+each successive module is larger than the previous and will | |
+only get loaded if needed. | |
+ | |
+=cut | |
+ | |
+# convertToAscii($string) | |
+# | |
+# Passes $string through the entries of @steps in order, skipping entries | |
+# whose module cannot be loaded, and stops as soon as the text is pure | |
+# ASCII.  The first loadable step always runs, even for ASCII input. | |
+# | |
+# Returns the converted text.  Emits a warning when non-ASCII characters | |
+# are still present after the last step. | |
+sub convertToAscii { | |
+    my ($text) = @_; | |
+ | |
+    STEP: | |
+    foreach my $attempt (@steps) { | |
+        if ( $testLoadModule->( $attempt->{module} ) ) { | |
+            $text = $attempt->{run}->($text); | |
+            last STEP if !has_nonAsciiChars($text); | |
+        } | |
+    } | |
+ | |
+    if ( has_nonAsciiChars($text) ) { | |
+        warn 'failed to fix "'.$text.'"'; | |
+    } | |
+    return $text; | |
+} | |
+ | |
+1; | |
+ | |
diff --git a/local/lib/WebGUI/Text/Utf8/Clean.pm b/local/lib/WebGUI/Text/Utf8/Clean.pm | |
new file mode 100644 | |
index 0000000..1ac98b5 | |
--- /dev/null | |
+++ b/local/lib/WebGUI/Text/Utf8/Clean.pm | |
@@ -0,0 +1,214 @@ | |
+use strict; | |
+use warnings; | |
+ | |
+package WebGUI::Text::Utf8::Clean; | |
+ | |
+=pod | |
+ | |
+code extracted from | |
+ | |
+http://www.perlmonks.org/?node_id=563765 | |
+ | |
+and modified to make me feel better... | |
+ | |
+=cut | |
+ | |
+# Transliteration table: each key is a single non-ASCII character, each value | |
+# its ASCII replacement.  An empty value removes the character (stand-alone | |
+# accents, Cyrillic hard/soft signs).  Covers accented Latin letters, Greek | |
+# (including polytonic forms) and basic Cyrillic. | |
+# NOTE(review): lower-case Cyrillic letters map to upper-case Latin here and | |
+# t/Text_Utf8.t expects that output; change both together if that is unwanted. | |
+our %asciiize = ( | |
+    "\x{00C0}" => "A", "\x{00C1}" => "A", "\x{00C2}" => "A", | |
+    "\x{00C3}" => "A", "\x{00C4}" => "Ae", "\x{00C5}" => "A", | |
+    "\x{00C6}" => "AE", "\x{0100}" => "A", "\x{0104}" => "A", | |
+    "\x{0102}" => "A", "\x{00C7}" => "C", "\x{0106}" => "C", | |
+    "\x{010C}" => "C", "\x{0108}" => "C", "\x{010A}" => "C", | |
+    "\x{010E}" => "D", "\x{0110}" => "D", "\x{00C8}" => "E", | |
+    "\x{00C9}" => "E", "\x{00CA}" => "E", "\x{00CB}" => "E", | |
+    "\x{0112}" => "E", "\x{0118}" => "E", "\x{011A}" => "E", | |
+    "\x{0114}" => "E", "\x{0116}" => "E", "\x{011C}" => "G", | |
+    "\x{011E}" => "G", "\x{0120}" => "G", "\x{0122}" => "G", | |
+    "\x{0124}" => "H", "\x{0126}" => "H", "\x{00CC}" => "I", | |
+    "\x{00CD}" => "I", "\x{00CE}" => "I", "\x{00CF}" => "I", | |
+    "\x{012A}" => "I", "\x{0128}" => "I", "\x{012C}" => "I", | |
+    "\x{012E}" => "I", "\x{0130}" => "I", "\x{0132}" => "IJ", | |
+    "\x{0134}" => "J", "\x{0136}" => "K", "\x{013D}" => "L", | |
+    "\x{0139}" => "L", "\x{013B}" => "L", "\x{013F}" => "L", | |
+    "\x{0141}" => "L", "\x{00D1}" => "N", "\x{0143}" => "N", | |
+    "\x{0147}" => "N", "\x{0145}" => "N", "\x{014A}" => "N", | |
+    "\x{00D2}" => "O", "\x{00D3}" => "O", "\x{00D4}" => "O", | |
+    "\x{00D5}" => "O", "\x{00D6}" => "Oe", "\x{00D8}" => "O", | |
+    "\x{014C}" => "O", "\x{0150}" => "O", "\x{014E}" => "O", | |
+    "\x{0152}" => "OE", "\x{0154}" => "R", "\x{0158}" => "R", | |
+    "\x{0156}" => "R", "\x{015A}" => "S", "\x{015E}" => "S", | |
+    "\x{015C}" => "S", "\x{0218}" => "S", "\x{0160}" => "S", | |
+    "\x{0164}" => "T", "\x{0162}" => "T", "\x{0166}" => "T", | |
+    "\x{021A}" => "T", "\x{00D9}" => "U", "\x{00DA}" => "U", | |
+    "\x{00DB}" => "U", "\x{00DC}" => "Ue", "\x{016A}" => "U", | |
+    "\x{016E}" => "U", "\x{0170}" => "U", "\x{016C}" => "U", | |
+    "\x{0168}" => "U", "\x{0172}" => "U", "\x{0174}" => "W", | |
+    "\x{0176}" => "Y", "\x{0178}" => "Y", "\x{00DD}" => "Y", | |
+    "\x{0179}" => "Z", "\x{017B}" => "Z", "\x{017D}" => "Z", | |
+    "\x{00E0}" => "a", "\x{00E1}" => "a", "\x{00E2}" => "a", | |
+    "\x{00E3}" => "a", "\x{00E4}" => "ae", "\x{0101}" => "a", | |
+    "\x{0105}" => "a", "\x{0103}" => "a", "\x{00E5}" => "a", | |
+    "\x{00E6}" => "ae", "\x{00E7}" => "c", "\x{0107}" => "c", | |
+    "\x{010D}" => "c", "\x{0109}" => "c", "\x{010B}" => "c", | |
+    "\x{010F}" => "d", "\x{0111}" => "d", "\x{00E8}" => "e", | |
+    "\x{00E9}" => "e", "\x{00EA}" => "e", "\x{00EB}" => "e", | |
+    "\x{0113}" => "e", "\x{0119}" => "e", "\x{011B}" => "e", | |
+    "\x{0115}" => "e", "\x{0117}" => "e", "\x{0192}" => "f", | |
+    "\x{011D}" => "g", "\x{011F}" => "g", "\x{0121}" => "g", | |
+    "\x{0123}" => "g", "\x{0125}" => "h", "\x{0127}" => "h", | |
+    "\x{00EC}" => "i", "\x{00ED}" => "i", "\x{00EE}" => "i", | |
+    "\x{00EF}" => "i", "\x{012B}" => "i", "\x{0129}" => "i", | |
+    "\x{012D}" => "i", "\x{012F}" => "i", "\x{0131}" => "i", | |
+    "\x{0133}" => "ij", "\x{0135}" => "j", "\x{0137}" => "k", | |
+    "\x{0138}" => "k", "\x{0142}" => "l", "\x{013E}" => "l", | |
+    "\x{013A}" => "l", "\x{013C}" => "l", "\x{0140}" => "l", | |
+    "\x{00F1}" => "n", "\x{0144}" => "n", "\x{0148}" => "n", | |
+    "\x{0146}" => "n", "\x{0149}" => "n", "\x{014B}" => "n", | |
+    "\x{00F2}" => "o", "\x{00F3}" => "o", "\x{00F4}" => "o", | |
+    "\x{00F5}" => "o", "\x{00F6}" => "oe", "\x{00F8}" => "o", | |
+    "\x{014D}" => "o", "\x{0151}" => "o", "\x{014F}" => "o", | |
+    "\x{0153}" => "oe", "\x{0155}" => "r", "\x{0159}" => "r", | |
+    "\x{0157}" => "r", "\x{015B}" => "s", "\x{0161}" => "s", | |
+    "\x{0165}" => "t", "\x{00F9}" => "u", "\x{00FA}" => "u", | |
+    "\x{00FB}" => "u", "\x{00FC}" => "ue", "\x{016B}" => "u", | |
+    "\x{016F}" => "u", "\x{0171}" => "u", "\x{016D}" => "u", | |
+    "\x{0169}" => "u", "\x{0173}" => "u", "\x{0175}" => "w", | |
+    "\x{00FF}" => "y", "\x{00FD}" => "y", "\x{0177}" => "y", | |
+    "\x{017C}" => "z", "\x{017A}" => "z", "\x{017E}" => "z", | |
+    "\x{00DF}" => "ss", "\x{017F}" => "s", "\x{0391}" => "A", | |
+    "\x{0386}" => "A", "\x{1F08}" => "A", "\x{1F09}" => "A", | |
+    "\x{1F0A}" => "A", "\x{1F0B}" => "A", "\x{1F0C}" => "A", | |
+    "\x{1F0D}" => "A", "\x{1F0E}" => "A", "\x{1F0F}" => "A", | |
+    "\x{1F88}" => "A", "\x{1F89}" => "A", "\x{1F8A}" => "A", | |
+    "\x{1F8B}" => "A", "\x{1F8C}" => "A", "\x{1F8D}" => "A", | |
+    "\x{1F8E}" => "A", "\x{1F8F}" => "A", "\x{1FB8}" => "A", | |
+    "\x{1FB9}" => "A", "\x{1FBA}" => "A", "\x{1FBB}" => "A", | |
+    "\x{1FBC}" => "A", "\x{0392}" => "B", "\x{0393}" => "G", | |
+    "\x{0394}" => "D", "\x{0395}" => "E", "\x{0388}" => "E", | |
+    "\x{1F18}" => "E", "\x{1F19}" => "E", "\x{1F1A}" => "E", | |
+    "\x{1F1B}" => "E", "\x{1F1C}" => "E", "\x{1F1D}" => "E", | |
+    "\x{1FC9}" => "E", "\x{1FC8}" => "E", "\x{0396}" => "Z", | |
+    "\x{0397}" => "I", "\x{0389}" => "I", "\x{1F28}" => "I", | |
+    "\x{1F29}" => "I", "\x{1F2A}" => "I", "\x{1F2B}" => "I", | |
+    "\x{1F2C}" => "I", "\x{1F2D}" => "I", "\x{1F2E}" => "I", | |
+    "\x{1F2F}" => "I", "\x{1F98}" => "I", "\x{1F99}" => "I", | |
+    "\x{1F9A}" => "I", "\x{1F9B}" => "I", "\x{1F9C}" => "I", | |
+    "\x{1F9D}" => "I", "\x{1F9E}" => "I", "\x{1F9F}" => "I", | |
+    "\x{1FCA}" => "I", "\x{1FCB}" => "I", "\x{1FCC}" => "I", | |
+    "\x{0398}" => "TH", "\x{0399}" => "I", "\x{038A}" => "I", | |
+    "\x{03AA}" => "I", "\x{1F38}" => "I", "\x{1F39}" => "I", | |
+    "\x{1F3A}" => "I", "\x{1F3B}" => "I", "\x{1F3C}" => "I", | |
+    "\x{1F3D}" => "I", "\x{1F3E}" => "I", "\x{1F3F}" => "I", | |
+    "\x{1FD8}" => "I", "\x{1FD9}" => "I", "\x{1FDA}" => "I", | |
+    "\x{1FDB}" => "I", "\x{039A}" => "K", "\x{039B}" => "L", | |
+    "\x{039C}" => "M", "\x{039D}" => "N", "\x{039E}" => "KS", | |
+    "\x{039F}" => "O", "\x{038C}" => "O", "\x{1F48}" => "O", | |
+    "\x{1F49}" => "O", "\x{1F4A}" => "O", "\x{1F4B}" => "O", | |
+    "\x{1F4C}" => "O", "\x{1F4D}" => "O", "\x{1FF8}" => "O", | |
+    "\x{1FF9}" => "O", "\x{03A0}" => "P", "\x{03A1}" => "R", | |
+    "\x{1FEC}" => "R", "\x{03A3}" => "S", "\x{03A4}" => "T", | |
+    "\x{03A5}" => "Y", "\x{038E}" => "Y", "\x{03AB}" => "Y", | |
+    "\x{1F59}" => "Y", "\x{1F5B}" => "Y", "\x{1F5D}" => "Y", | |
+    "\x{1F5F}" => "Y", "\x{1FE8}" => "Y", "\x{1FE9}" => "Y", | |
+    "\x{1FEA}" => "Y", "\x{1FEB}" => "Y", "\x{03A6}" => "F", | |
+    "\x{03A7}" => "X", "\x{03A8}" => "PS", "\x{03A9}" => "O", | |
+    "\x{038F}" => "O", "\x{1F68}" => "O", "\x{1F69}" => "O", | |
+    "\x{1F6A}" => "O", "\x{1F6B}" => "O", "\x{1F6C}" => "O", | |
+    "\x{1F6D}" => "O", "\x{1F6E}" => "O", "\x{1F6F}" => "O", | |
+    "\x{1FA8}" => "O", "\x{1FA9}" => "O", "\x{1FAA}" => "O", | |
+    "\x{1FAB}" => "O", "\x{1FAC}" => "O", "\x{1FAD}" => "O", | |
+    "\x{1FAE}" => "O", "\x{1FAF}" => "O", "\x{1FFA}" => "O", | |
+    "\x{1FFB}" => "O", "\x{1FFC}" => "O", "\x{03B1}" => "a", | |
+    "\x{03AC}" => "a", "\x{1F00}" => "a", "\x{1F01}" => "a", | |
+    "\x{1F02}" => "a", "\x{1F03}" => "a", "\x{1F04}" => "a", | |
+    "\x{1F05}" => "a", "\x{1F06}" => "a", "\x{1F07}" => "a", | |
+    "\x{1F80}" => "a", "\x{1F81}" => "a", "\x{1F82}" => "a", | |
+    "\x{1F83}" => "a", "\x{1F84}" => "a", "\x{1F85}" => "a", | |
+    "\x{1F86}" => "a", "\x{1F87}" => "a", "\x{1F70}" => "a", | |
+    "\x{1F71}" => "a", "\x{1FB0}" => "a", "\x{1FB1}" => "a", | |
+    "\x{1FB2}" => "a", "\x{1FB3}" => "a", "\x{1FB4}" => "a", | |
+    "\x{1FB6}" => "a", "\x{1FB7}" => "a", "\x{03B2}" => "b", | |
+    "\x{03B3}" => "g", "\x{03B4}" => "d", "\x{03B5}" => "e", | |
+    "\x{03AD}" => "e", "\x{1F10}" => "e", "\x{1F11}" => "e", | |
+    "\x{1F12}" => "e", "\x{1F13}" => "e", "\x{1F14}" => "e", | |
+    "\x{1F15}" => "e", "\x{1F72}" => "e", "\x{1F73}" => "e", | |
+    "\x{03B6}" => "z", "\x{03B7}" => "i", "\x{03AE}" => "i", | |
+    "\x{1F20}" => "i", "\x{1F21}" => "i", "\x{1F22}" => "i", | |
+    "\x{1F23}" => "i", "\x{1F24}" => "i", "\x{1F25}" => "i", | |
+    "\x{1F26}" => "i", "\x{1F27}" => "i", "\x{1F90}" => "i", | |
+    "\x{1F91}" => "i", "\x{1F92}" => "i", "\x{1F93}" => "i", | |
+    "\x{1F94}" => "i", "\x{1F95}" => "i", "\x{1F96}" => "i", | |
+    "\x{1F97}" => "i", "\x{1F74}" => "i", "\x{1F75}" => "i", | |
+    "\x{1FC2}" => "i", "\x{1FC3}" => "i", "\x{1FC4}" => "i", | |
+    "\x{1FC6}" => "i", "\x{1FC7}" => "i", "\x{03B8}" => "th", | |
+    "\x{03B9}" => "i", "\x{03AF}" => "i", "\x{03CA}" => "i", | |
+    "\x{0390}" => "i", "\x{1F30}" => "i", "\x{1F31}" => "i", | |
+    "\x{1F32}" => "i", "\x{1F33}" => "i", "\x{1F34}" => "i", | |
+    "\x{1F35}" => "i", "\x{1F36}" => "i", "\x{1F37}" => "i", | |
+    "\x{1F76}" => "i", "\x{1F77}" => "i", "\x{1FD0}" => "i", | |
+    "\x{1FD1}" => "i", "\x{1FD2}" => "i", "\x{1FD3}" => "i", | |
+    "\x{1FD6}" => "i", "\x{1FD7}" => "i", "\x{03BA}" => "k", | |
+    "\x{03BB}" => "l", "\x{03BC}" => "m", "\x{03BD}" => "n", | |
+    "\x{03BE}" => "ks", "\x{03BF}" => "o", "\x{03CC}" => "o", | |
+    "\x{1F40}" => "o", "\x{1F41}" => "o", "\x{1F42}" => "o", | |
+    "\x{1F43}" => "o", "\x{1F44}" => "o", "\x{1F45}" => "o", | |
+    "\x{1F78}" => "o", "\x{1F79}" => "o", "\x{03C0}" => "p", | |
+    "\x{03C1}" => "r", "\x{1FE4}" => "r", "\x{1FE5}" => "r", | |
+    "\x{03C3}" => "s", "\x{03C2}" => "s", "\x{03C4}" => "t", | |
+    "\x{03C5}" => "y", "\x{03CD}" => "y", "\x{03CB}" => "y", | |
+    "\x{03B0}" => "y", "\x{1F50}" => "y", "\x{1F51}" => "y", | |
+    "\x{1F52}" => "y", "\x{1F53}" => "y", "\x{1F54}" => "y", | |
+    "\x{1F55}" => "y", "\x{1F56}" => "y", "\x{1F57}" => "y", | |
+    "\x{1F7A}" => "y", "\x{1F7B}" => "y", "\x{1FE0}" => "y", | |
+    "\x{1FE1}" => "y", "\x{1FE2}" => "y", "\x{1FE3}" => "y", | |
+    "\x{1FE6}" => "y", "\x{1FE7}" => "y", "\x{03C6}" => "f", | |
+    "\x{03C7}" => "x", "\x{03C8}" => "ps", "\x{03C9}" => "o", | |
+    "\x{03CE}" => "o", "\x{1F60}" => "o", "\x{1F61}" => "o", | |
+    "\x{1F62}" => "o", "\x{1F63}" => "o", "\x{1F64}" => "o", | |
+    "\x{1F65}" => "o", "\x{1F66}" => "o", "\x{1F67}" => "o", | |
+    "\x{1FA0}" => "o", "\x{1FA1}" => "o", "\x{1FA2}" => "o", | |
+    "\x{1FA3}" => "o", "\x{1FA4}" => "o", "\x{1FA5}" => "o", | |
+    "\x{1FA6}" => "o", "\x{1FA7}" => "o", "\x{1F7C}" => "o", | |
+    "\x{1F7D}" => "o", "\x{1FF2}" => "o", "\x{1FF3}" => "o", | |
+    "\x{1FF4}" => "o", "\x{1FF6}" => "o", "\x{1FF7}" => "o", | |
+    "\x{00A8}" => "", "\x{0385}" => "", "\x{1FBF}" => "", | |
+    "\x{1FFE}" => "", "\x{1FCD}" => "", "\x{1FDD}" => "", | |
+    "\x{1FCE}" => "", "\x{1FDE}" => "", "\x{1FCF}" => "", | |
+    "\x{1FDF}" => "", "\x{1FC0}" => "", "\x{1FC1}" => "", | |
+    "\x{0384}" => "", "\x{1FEE}" => "", "\x{1FEF}" => "", | |
+    "\x{1FED}" => "", "\x{037A}" => "", "\x{1FBD}" => "", | |
+    "\x{0410}" => "A", "\x{0411}" => "B", "\x{0412}" => "V", | |
+    "\x{0413}" => "G", "\x{0414}" => "D", "\x{0415}" => "E", | |
+    "\x{0401}" => "E", "\x{0416}" => "ZH", "\x{0417}" => "Z", | |
+    "\x{0418}" => "I", "\x{0419}" => "I", "\x{041A}" => "K", | |
+    "\x{041B}" => "L", "\x{041C}" => "M", "\x{041D}" => "N", | |
+    "\x{041E}" => "O", "\x{041F}" => "P", "\x{0420}" => "R", | |
+    "\x{0421}" => "S", "\x{0422}" => "T", "\x{0423}" => "U", | |
+    "\x{0424}" => "F", "\x{0425}" => "KH", "\x{0426}" => "TS", | |
+    "\x{0427}" => "CH", "\x{0428}" => "SH", "\x{0429}" => "SHCH", | |
+    "\x{042B}" => "Y", "\x{042D}" => "E", "\x{042E}" => "YU", | |
+    "\x{042F}" => "YA", "\x{0430}" => "A", "\x{0431}" => "B", | |
+    "\x{0432}" => "V", "\x{0433}" => "G", "\x{0434}" => "D", | |
+    "\x{0435}" => "E", "\x{0451}" => "E", "\x{0436}" => "ZH", | |
+    "\x{0437}" => "Z", "\x{0438}" => "I", "\x{0439}" => "I", | |
+    "\x{043A}" => "K", "\x{043B}" => "L", "\x{043C}" => "M", | |
+    "\x{043D}" => "N", "\x{043E}" => "O", "\x{043F}" => "P", | |
+    "\x{0440}" => "R", "\x{0441}" => "S", "\x{0442}" => "T", | |
+    "\x{0443}" => "U", "\x{0444}" => "F", "\x{0445}" => "KH", | |
+    "\x{0446}" => "TS", "\x{0447}" => "CH", "\x{0448}" => "SH", | |
+    "\x{0449}" => "SHCH", "\x{044B}" => "Y", "\x{044D}" => "E", | |
+    "\x{044E}" => "YU", "\x{044F}" => "YA", "\x{042A}" => "", | |
+    "\x{044A}" => "", "\x{042C}" => "", "\x{044C}" => "", | |
+    "\x{00F0}" => "d", "\x{00D0}" => "D", "\x{00FE}" => "th", | |
+    "\x{00DE}" => "TH", | |
+); | |
+ | |
+# clean($string) | |
+# WebGUI::Text::Utf8::Clean->clean($string) | |
+# | |
+# Replaces every non-ASCII character of $string that has an entry in | |
+# %asciiize with its ASCII replacement; characters without an entry are left | |
+# untouched.  Works as a plain function and as a class or instance method. | |
+# | |
+# Returns the converted copy; undef is passed through unchanged. | |
+sub clean { | |
+    # A method call supplies the invocant plus the string.  Counting the | |
+    # arguments (rather than comparing against the package name) also works | |
+    # for subclasses and keeps a lone string that happens to equal the | |
+    # package name from being mistaken for the invocant. | |
+    shift if @_ > 1; | |
+    my $string = shift; | |
+    return $string unless defined $string; | |
+ | |
+    $string =~ s/([^\0-\x7f])/exists($asciiize{$1})?$asciiize{$1}:$1/eg; | |
+    return $string; | |
+} | |
+ | |
+1; | |
+ | |
diff --git a/t/Text_Utf8.t b/t/Text_Utf8.t | |
new file mode 100644 | |
index 0000000..40d934f | |
--- /dev/null | |
+++ b/t/Text_Utf8.t | |
@@ -0,0 +1,44 @@ | |
+use strict; | |
+use warnings; | |
+use utf8;                    # this file contains UTF-8 string literals | |
+use open qw/:std :utf8/;     # UTF-8 on STDIN/STDOUT/STDERR and on new handles | |
+ | |
+=pod | |
+ | |
+test WebGUI::Text::Utf8 | |
+ | |
+    cd /data/WebGUI | |
+    source ../wre/sbin/setenvironment.sh | |
+    perl -Ilib -Ilocal/lib t/Text_Utf8.t | |
+ | |
+=cut | |
+ | |
+use Test::More; | |
+ | |
+use WebGUI::Text::Utf8; | |
+ | |
+my $asciistring = 'resume'; | |
+my $utf8string  = 'résumé'; | |
+ | |
+# scalar() matters: a failed match evaluated in list context yields an empty | |
+# list, which would turn the test name into the value being tested. | |
+ok( !WebGUI::Text::Utf8::has_nonAsciiChars($asciistring), 'ascii string' ); | |
+ok( scalar( WebGUI::Text::Utf8::has_nonAsciiChars($utf8string) ), 'utf8 string' ); | |
+ | |
+is( WebGUI::Text::Utf8::convertToAscii($asciistring), $asciistring, | |
+    'ascii string is returned unchanged' ); | |
+ | |
+# [ input, expected ASCII output ] | |
+my @strings = ( | |
+    [ 'résumé', 'resume' ], | |
+    [ 'Соединенные Штаты Америки открывают новое посольство в Бишкеке', | |
+      'SOEDINENNYE SHTATY AMERIKI OTKRYVAYUT NOVOE POSOLSTVO V BISHKEKE' ], | |
+    [ '美中执法合作联合联络小组', | |
+      'Mei Zhong Zhi Fa He Zuo Lian He Lian Luo Xiao Zu ' ], | |
+); | |
+ | |
+# Not "my $a": $a is reserved for sort blocks. | |
+for my $case (@strings) { | |
+    my ( $input, $expected ) = @{$case}; | |
+    is( WebGUI::Text::Utf8::convertToAscii($input), $expected, | |
+        'successfully convert ' . $input ); | |
+} | |
+ | |
+done_testing; | |
+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment