Created
June 1, 2011 14:14
-
-
Save csirac2/1002361 to your computer and use it in GitHub Desktop.
CharacterTests - beginning utf8 stress tests for foswiki
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Test-case package preamble: strictures, UTF-8 source literals, and the
# Foswiki test/UI modules this test case is built on.
package CharacterTests;
use strict;
use warnings;

# Using utf8, so we can have interesting chars in the source code.
use utf8;

use FoswikiFnTestCase;
our @ISA = qw( FoswikiFnTestCase );

use Foswiki;
use Foswiki::UI::View;
use Error qw( :try );

# NOTE(review): declared here but never assigned in the visible part of the
# file; presumably filled in by test set-up code -- confirm.
our $UI_FN;
# Charsets, with a few representative sample words (in lower-case) for that
# charset. On the latin charsets, tried to choose words which began and ended
# with non-ascii chars, to try to exercise wikiword regex/logic
#
# These are of course stored in the source code here in utf8.
#
# Layout: charset name => [ sample words ]. The comment block above each
# entry lists the non-ASCII repertoire of that charset for reference, followed
# by the language of each sample word.
my %utf8words = (

    # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
    # wiki, french, french
    'iso-8859-1' => [ 'wiki', 'âcreté', 'çà' ],

    # Ą˘Ł¤ĽŚ§¨ŠŞŤŹŽŻ°ą˛ł´ľśˇ¸šşťź˝žżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕá
    # âăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙
    # wiki, german, polish
    'iso-8859-2' => [ 'wiki', 'überaß', 'łódż' ],

    # ‘’£€₯¦§¨©ͺ«¬―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΤΥΦΧΨΩΪΫάέήίΰαβγδ
    # εζηθικλμνξοπρςστυφχψωϊϋόύώ
    # wiki, greek, greek
    'iso-8859-7' => [ 'wiki', 'φάω', 'πράσινο' ],

    # ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—™љ›њќћџЎўЈ¤Ґ¦§Ё©Є«¬HY®Ї°±Ііґµ¶·ё№є»јЅѕїА
    # ВБГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя
    # wiki, russian, russian
    'cp-1251' => [ 'wiki', 'зеленый', 'вертолет' ],
);
# 'native' encodings of the %utf8words.
# Declared empty here; nothing in the visible part of the file populates it.
my %nativewords;
# Some characters of interest, and their ordinal value in various charsets.
# iso-8859-1: Latin-1, Default Foswiki charset
# iso-8859-2: Latin/"Eastern European" (including German)
# iso-8859-7: Latin/Greek
# cp-1251: Cyrillic (MS-Windows)
#
# Layout: literal character => {
#     unicode     => the same character written as a "\N{U+XXXX}" escape
#                    (must be inside a double-quoted string to be an escape),
#     desc        => short human-readable description,
#     '<charset>' => ordinal (byte value) of the character in that charset;
#                    a charset key is present only if the charset contains
#                    the character.
# }
# Many entries deliberately share a byte value with a different character in
# another charset (e.g. 163 is pound, L-stroke or Je depending on charset).
my %chars = (
    '£' => {
        unicode      => "\N{U+00A3}",
        desc         => 'Currency pound',
        'iso-8859-1' => 163,
        'iso-8859-7' => 163,
    },
    'Ł' => {
        unicode      => "\N{U+0141}",
        desc         => 'L-stroke',
        'iso-8859-2' => 163,
    },
    'Ј' => {
        unicode   => "\N{U+0408}",
        desc      => 'Je (cyrillic J)',
        'cp-1251' => 163,
    },
    '¥' => {
        unicode      => "\N{U+00A5}",
        desc         => 'Currency yen',
        'iso-8859-1' => 165,
    },
    'Ľ' => {
        unicode      => "\N{U+013D}",
        desc         => 'L-caron',
        'iso-8859-2' => 165,
    },
    'Š' => {
        unicode      => "\N{U+0160}",
        desc         => 'S-caron',
        'iso-8859-2' => 169,
    },
    '©' => {
        unicode      => "\N{U+00A9}",
        desc         => '(c)',
        'cp-1251'    => 169,
        'iso-8859-1' => 169,
        'iso-8859-7' => 169,
    },
    '®' => {
        unicode      => "\N{U+00AE}",
        desc         => '(r)',
        'iso-8859-1' => 174,
    },
    '°' => {
        unicode      => "\N{U+00B0}",
        desc         => 'Degree symbol',
        'cp-1251'    => 176,
        'iso-8859-1' => 176,
        'iso-8859-2' => 176,
        'iso-8859-7' => 176,
    },
    '±' => {
        unicode      => "\N{U+00B1}",
        desc         => 'Plus/minus',
        'cp-1251'    => 177,
        'iso-8859-1' => 177,
        'iso-8859-7' => 177,
    },
    'µ' => {
        unicode      => "\N{U+00B5}",
        desc         => 'Micro',
        'cp-1251'    => 181,
        'iso-8859-1' => 181,
    },
    'μ' => {
        unicode      => "\N{U+03BC}",
        desc         => 'Mu',
        'iso-8859-7' => 236,
    },
    '½' => {
        unicode      => "\N{U+00BD}",
        desc         => 'One half',
        'iso-8859-1' => 189,
        'iso-8859-7' => 189,
    },
    'Ѕ' => {
        unicode   => "\N{U+0405}",
        desc      => 'Dze (cyrillic S)',
        'cp-1251' => 189,
    },
    'Γ' => {
        unicode      => "\N{U+0393}",
        desc         => 'Gamma',
        'iso-8859-7' => 195,
    },
    'Г' => {
        unicode   => "\N{U+0413}",
        desc      => 'Ghe (cyrillic Gamma)',
        'cp-1251' => 195,
    },
    'é' => {
        unicode      => "\N{U+00E9}",
        desc         => 'e-acute',
        'iso-8859-1' => 233,
        'iso-8859-2' => 233,

        # No iso-8859-7 entry: that charset has no e-acute (byte 233 there
        # is the Greek small letter iota).
    },
    'й' => {
        unicode   => "\N{U+0439}",
        desc      => 'yot (cyrillic, cf. N-caron)',
        'cp-1251' => 233,
    },
    'ö' => {
        unicode      => "\N{U+00F6}",
        desc         => 'o-umlaut',
        'iso-8859-1' => 246,
        'iso-8859-2' => 246,
    },
    'ц' => {
        unicode   => "\N{U+0446}",
        desc      => 'Tse (cyrillic, cf. u)',
        'cp-1251' => 246,
    },
    '÷' => {
        unicode      => "\N{U+00F7}",
        desc         => 'Division',
        'iso-8859-1' => 247,
        'iso-8859-2' => 247,
    },
    'φ' => {
        unicode      => "\N{U+03C6}",
        desc         => 'phi',
        'iso-8859-7' => 246,
    },
    'χ' => {

        # Small chi is U+03C7 (U+03A7 is the capital letter); byte 247 in
        # iso-8859-7 is the small letter.
        unicode      => "\N{U+03C7}",
        desc         => 'Chi',
        'iso-8859-7' => 247,
    },
    'Я' => {
        unicode => "\N{U+042F}",
        desc    => 'Ya (cyrillic, cf. reverse R)',

        # Capital Ya is byte 223 (0xDF) in cp-1251; byte 247 is small che.
        'cp-1251' => 223,
    },
    'щ' => {
        unicode   => "\N{U+0449}",
        desc      => 'Shcha (cyrillic, cf. W)',
        'cp-1251' => 249,
    },
    'ü' => {
        unicode      => "\N{U+00FC}",
        desc         => 'u-umlaut',
        'iso-8859-1' => 252,
        'iso-8859-2' => 252,
    },
    'ώ' => {
        unicode      => "\N{U+03CE}",
        desc         => 'omega-acute',
        'iso-8859-7' => 254,
    },
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment