Skip to content

Instantly share code, notes, and snippets.

@csirac2
Created June 1, 2011 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save csirac2/1002361 to your computer and use it in GitHub Desktop.
Save csirac2/1002361 to your computer and use it in GitHub Desktop.
CharacterTests - beginning utf8 stress tests for foswiki
# Test-case package holding character-set fixtures for Foswiki tests.
package CharacterTests;
use strict;
use warnings;
# Using utf8, so we can have interesting chars in the source code.
use utf8;
use FoswikiFnTestCase;
# Inherit from FoswikiFnTestCase (loaded just above).
our @ISA = qw( FoswikiFnTestCase );
use Foswiki;
use Foswiki::UI::View;
# Imports the try/catch syntax from Error.
use Error qw( :try );
# Package variable, declared but not assigned in this part of the file.
# NOTE(review): presumably set to a UI entry point during test set-up - confirm.
our $UI_FN;
# Sample words (all lower-case) for each charset under test, keyed by charset
# name. For the latin charsets the words were picked so that they start and
# end with non-ascii characters, to exercise the wikiword regex/logic.
#
# The literals below are utf8, like the rest of this source file. Each entry
# is preceded by a comment showing the charset's high-half repertoire and the
# language of each sample word.
my %utf8words = (

    # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
    # Words: wiki, french, french
    'iso-8859-1' => [ qw( wiki âcreté çà ) ],

    # Ą˘Ł¤ĽŚ§¨ŠŞŤŹŽŻ°ą˛ł´ľśˇ¸šşťź˝žżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕá
    # âăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙
    # Words: wiki, german, polish
    'iso-8859-2' => [ qw( wiki überaß łódż ) ],

    # ‘’£€₯¦§¨©ͺ«¬―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΤΥΦΧΨΩΪΫάέήίΰαβγδ
    # εζηθικλμνξοπρςστυφχψωϊϋόύώ
    # Words: wiki, greek, greek
    'iso-8859-7' => [ qw( wiki φάω πράσινο ) ],

    # ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—™љ›њќћџЎўЈ¤Ґ¦§Ё©Є«¬HY®Ї°±Ііґµ¶·ё№є»јЅѕїА
    # ВБГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя
    # Words: wiki, russian, russian
    'cp-1251' => [ qw( wiki зеленый вертолет ) ],
);
# 'native' (per-charset) encodings of the words in %utf8words.
# Declared empty here; nothing in this part of the file fills it in.
# NOTE(review): presumably populated during test set-up - confirm.
my %nativewords;
# Some characters of interest, and their ordinal value in various charsets.
#   iso-8859-1: Latin-1, default Foswiki charset
#   iso-8859-2: Latin-2, "Eastern European" (including German)
#   iso-8859-7: Latin/Greek
#   cp-1251:    Cyrillic (MS-Windows)
#
# Each entry is keyed by the literal character (this source is utf8) and holds:
#   unicode   - the same character written as a "\N{U+XXXX}" string escape
#   desc      - a short human-readable description
#   <charset> - the character's single-byte ordinal in that charset; a charset
#               key is only present when the character exists in that charset
#
# Many entries deliberately share an ordinal across charsets (e.g. 163 is a
# different character in iso-8859-1, iso-8859-2 and cp-1251).
my %chars = (
    '£' => {
        unicode      => "\N{U+00A3}",
        desc         => 'Currency pound',
        'iso-8859-1' => 163,
        'iso-8859-7' => 163,
    },
    'Ł' => {
        unicode      => "\N{U+0141}",
        desc         => 'L-stroke',
        'iso-8859-2' => 163,
    },
    'Ј' => {
        unicode   => "\N{U+0408}",
        desc      => 'Je (cyrillic J)',
        'cp-1251' => 163,
    },
    '¥' => {
        unicode      => "\N{U+00A5}",
        desc         => 'Currency yen',
        'iso-8859-1' => 165,
    },
    'Ľ' => {
        unicode      => "\N{U+013D}",
        desc         => 'L-caron',
        'iso-8859-2' => 165,
    },
    'Š' => {
        unicode      => "\N{U+0160}",
        desc         => 'S-caron',
        'iso-8859-2' => 169,
    },
    '©' => {
        unicode      => "\N{U+00A9}",
        desc         => '(c)',
        'cp-1251'    => 169,
        'iso-8859-1' => 169,
        'iso-8859-7' => 169,
    },
    '®' => {
        unicode      => "\N{U+00AE}",
        desc         => '(r)',
        'iso-8859-1' => 174,
    },
    '°' => {
        unicode      => "\N{U+00B0}",
        desc         => 'Degree symbol',
        'cp-1251'    => 176,
        'iso-8859-1' => 176,
        'iso-8859-2' => 176,
        'iso-8859-7' => 176,
    },
    '±' => {
        unicode      => "\N{U+00B1}",
        desc         => 'Plus/minus',
        'cp-1251'    => 177,
        'iso-8859-1' => 177,
        'iso-8859-7' => 177,
    },
    'µ' => {
        unicode      => "\N{U+00B5}",
        desc         => 'Micro',
        'cp-1251'    => 181,
        'iso-8859-1' => 181,
    },
    'μ' => {
        unicode      => "\N{U+03BC}",
        desc         => 'Mu',
        'iso-8859-7' => 236,
    },
    '½' => {
        unicode      => "\N{U+00BD}",
        desc         => 'One half',
        'iso-8859-1' => 189,
        'iso-8859-7' => 189,
    },
    'Ѕ' => {
        unicode   => "\N{U+0405}",
        desc      => 'Dze (cyrillic S)',
        'cp-1251' => 189,
    },
    'Γ' => {
        unicode      => "\N{U+0393}",
        desc         => 'Gamma',
        'iso-8859-7' => 195,
    },
    'Г' => {
        unicode   => "\N{U+0413}",
        desc      => 'Ghe (cyrillic Gamma)',
        'cp-1251' => 195,
    },
    'é' => {
        unicode      => "\N{U+00E9}",
        desc         => 'e-acute',
        'iso-8859-1' => 233,
        'iso-8859-2' => 233,

        # No iso-8859-7 entry: that charset has no e-acute (byte 233 there is
        # the Greek small letter iota).
    },
    'й' => {
        unicode   => "\N{U+0439}",
        desc      => 'yot (cyrillic, cf. N-caron)',
        'cp-1251' => 233,
    },
    'ö' => {
        unicode      => "\N{U+00F6}",
        desc         => 'o-umlaut',
        'iso-8859-1' => 246,
        'iso-8859-2' => 246,
    },
    'ц' => {
        unicode   => "\N{U+0446}",
        desc      => 'Tse (cyrillic, cf. u)',
        'cp-1251' => 246,
    },
    '÷' => {
        unicode      => "\N{U+00F7}",
        desc         => 'Division',
        'iso-8859-1' => 247,
        'iso-8859-2' => 247,
    },
    'φ' => {
        unicode      => "\N{U+03C6}",
        desc         => 'phi',
        'iso-8859-7' => 246,
    },
    'χ' => {

        # Lower-case chi is U+03C7 (U+03A7 is the capital letter).
        unicode      => "\N{U+03C7}",
        desc         => 'Chi',
        'iso-8859-7' => 247,
    },
    'Я' => {
        unicode => "\N{U+042F}",
        desc    => 'Ya (cyrillic, cf. reverse R)',

        # Capital Ya is byte 0xDF (223) in cp-1251; 247 (0xF7) is small che.
        'cp-1251' => 223,
    },
    'щ' => {
        unicode   => "\N{U+0449}",
        desc      => 'Shcha (cyrillic, cf. W)',
        'cp-1251' => 249,
    },
    'ü' => {
        unicode      => "\N{U+00FC}",
        desc         => 'u-umlaut',
        'iso-8859-1' => 252,
        'iso-8859-2' => 252,
    },
    'ώ' => {
        unicode      => "\N{U+03CE}",
        desc         => 'omega-acute',
        'iso-8859-7' => 254,
    },
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment