Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masakielastic/5793665 to your computer and use it in GitHub Desktop.
Save masakielastic/5793665 to your computer and use it in GitHub Desktop.
mb_convert_encoding breaks well-formed characters (PHP 5.5RC3, Mac OS X 10.8).
<?php
/**
 * Compares how three converters treat an ill-formed trailing byte that follows
 * well-formed 2-byte UTF-8 characters (U+00A2 CENT SIGN).
 *
 * Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
 *
 * Prints the label, then one boolean per converter (does its output equal the
 * expectation listed for it?), then a comparison of the two expectation lists.
 */

// Ask mbstring to substitute U+FFFD for bytes it cannot convert.
mb_substitute_character(0xFFFD);

$cent = "\xC2\xA2";     // U+00A2 CENT SIGN, well-formed
$fffd = "\xEF\xBF\xBD"; // U+FFFD REPLACEMENT CHARACTER

// Both inputs are ill-formed: two cent signs followed by a stray byte.
$data = [
    $cent . $cent . "\xC2", // lone lead byte at the end
    $cent . $cent . "\xA2", // lone continuation byte at the end
];

// Expected from UConverter::transcode() and htmlspecialchars(ENT_SUBSTITUTE):
// the stray byte becomes a single U+FFFD in both cases.
$expected = [
    $cent . $cent . $fffd,
    $cent . $cent . $fffd,
];

// Expected from mb_convert_encoding(), as recorded by the original author:
// the trailing lead byte is dropped, the trailing continuation byte is replaced.
$expected2 = [
    $cent . $cent,
    $cent . $cent . $fffd,
];

$viaIcu = function ($str) {
    return UConverter::transcode($str, 'UTF-8', 'UTF-8');
};
$viaHtml = function ($str) {
    return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
};
$viaMbstring = function ($str) {
    return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
};

// One entry per converter, evaluated in the order ICU, htmlspecialchars, mbstring.
$converterMatches = [
    $expected === array_map($viaIcu, $data),
    $expected === array_map($viaHtml, $data),
    $expected2 === array_map($viaMbstring, $data),
];

// Where the two expectation lists differ and where they agree.
$expectationDiff = [
    // mb_convert_encoding deletes the trailing byte instead of emitting U+FFFD
    $expected[0] !== $expected2[0],
    $expected[1] === $expected2[1],
];

var_dump(
    '2-byte character: U+00A2 (CENT SIGN)',
    $converterMatches,
    $expectationDiff
);
<?php
/**
 * Compares how three converters treat truncated 3-byte UTF-8 sequences placed
 * before or after well-formed characters (U+20AC EURO SIGN).
 *
 * Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
 *
 * Prints the label, then one boolean per converter (does its output equal the
 * expectation listed for it?), then a comparison of the two expectation lists.
 */

// Ask mbstring to substitute U+FFFD for bytes it cannot convert.
mb_substitute_character(0xFFFD);

$euro = "\xE2\x82\xAC"; // U+20AC EURO SIGN, well-formed
$fffd = "\xEF\xBF\xBD"; // U+FFFD REPLACEMENT CHARACTER

// Every input is ill-formed: two euro signs plus a fragment of a third.
$data2 = [
    "\xE2\x82" . $euro . $euro, // leading fragment missing its last byte
    "\x82\xAC" . $euro . $euro, // leading fragment missing its lead byte
    $euro . $euro . "\xE2\x82", // trailing fragment missing its last byte
    $euro . $euro . "\x82\xAC", // trailing fragment missing its lead byte
];

// Expected from UConverter::transcode() and htmlspecialchars(ENT_SUBSTITUTE).
$expected3 = [
    $fffd . $euro . $euro,         // one U+FFFD
    $fffd . $fffd . $euro . $euro, // two U+FFFD
    $euro . $euro . $fffd,         // one U+FFFD
    $euro . $euro . $fffd . $fffd, // two U+FFFD
];

// Expected from mb_convert_encoding(), as recorded by the original author.
$expected4 = [
    $fffd . $fffd . $fffd . $euro, // three U+FFFD; only one euro sign is left
    $fffd . $fffd . $euro . $euro, // two U+FFFD
    $euro . $euro,                 // trailing fragment dropped, no U+FFFD
    $euro . $euro . $fffd . $fffd, // two U+FFFD
];

$viaIcu = function ($str) {
    return UConverter::transcode($str, 'UTF-8', 'UTF-8');
};
$viaHtml = function ($str) {
    return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
};
$viaMbstring = function ($str) {
    return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
};

// One entry per converter, evaluated in the order ICU, htmlspecialchars, mbstring.
$converterMatches = [
    $expected3 === array_map($viaIcu, $data2),
    $expected3 === array_map($viaHtml, $data2),
    $expected4 === array_map($viaMbstring, $data2),
];

// Where the two expectation lists differ and where they agree.
$expectationDiff = [
    // mb_convert_encoding breaks a well-formed character
    $expected3[0] !== $expected4[0],
    $expected3[1] === $expected4[1],
    // mb_convert_encoding deletes trailing bytes instead of emitting U+FFFD
    $expected3[2] !== $expected4[2],
    $expected3[3] === $expected4[3],
];

var_dump(
    '3-byte character: U+20AC (EURO SIGN)',
    $converterMatches,
    $expectationDiff
);
<?php
/**
 * Compares how three converters treat truncated 4-byte UTF-8 sequences placed
 * before or after well-formed characters (U+24B62, a Han character).
 *
 * Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
 *
 * Prints the label, then one boolean per converter (does its output equal the
 * expectation listed for it?), then a comparison of the two expectation lists.
 */

// Ask mbstring to substitute U+FFFD for bytes it cannot convert.
mb_substitute_character(0xFFFD);

$han  = "\xF0\xA4\xAD\xA2"; // U+24B62, well-formed
$fffd = "\xEF\xBF\xBD";     // U+FFFD REPLACEMENT CHARACTER

// Every input is ill-formed: two Han characters plus a fragment of a third.
$data3 = [
    "\xF0\xA4\xAD" . $han . $han, // leading fragment missing its last byte
    "\xA4\xAD\xA2" . $han . $han, // leading fragment missing its lead byte
    $han . $han . "\xF0\xA4\xAD", // trailing fragment missing its last byte
    $han . $han . "\xA4\xAD\xA2", // trailing fragment missing its lead byte
];

// Expected from UConverter::transcode() and htmlspecialchars(ENT_SUBSTITUTE).
$expected5 = [
    $fffd . $han . $han,                 // one U+FFFD
    $fffd . $fffd . $fffd . $han . $han, // three U+FFFD
    $han . $han . $fffd,                 // one U+FFFD
    $han . $han . $fffd . $fffd . $fffd, // three U+FFFD
];

// Expected from mb_convert_encoding(), as recorded by the original author.
$expected6 = [
    $fffd . $fffd . $fffd . $fffd . $han, // four U+FFFD; only one Han character is left
    $fffd . $fffd . $fffd . $han . $han,  // three U+FFFD
    $han . $han,                          // trailing fragment dropped, no U+FFFD
    $han . $han . $fffd . $fffd . $fffd,  // three U+FFFD
];

$viaIcu = function ($str) {
    return UConverter::transcode($str, 'UTF-8', 'UTF-8');
};
$viaHtml = function ($str) {
    return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
};
$viaMbstring = function ($str) {
    return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
};

// One entry per converter, evaluated in the order ICU, htmlspecialchars, mbstring.
$converterMatches = [
    $expected5 === array_map($viaIcu, $data3),
    $expected5 === array_map($viaHtml, $data3),
    $expected6 === array_map($viaMbstring, $data3),
];

// Where the two expectation lists differ and where they agree.
$expectationDiff = [
    // mb_convert_encoding breaks a well-formed character
    $expected5[0] !== $expected6[0],
    $expected5[1] === $expected6[1],
    // mb_convert_encoding deletes trailing bytes instead of emitting U+FFFD
    $expected5[2] !== $expected6[2],
    $expected5[3] === $expected6[3],
];

var_dump(
    '4-byte character: U+24B62 (Unicode Han Character)',
    $converterMatches,
    $expectationDiff
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment