Last active
October 13, 2015 00:08
-
-
Save triplepoint/4108015 to your computer and use it in GitHub Desktop.
PHP multibyte string demonstration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* The point of this demonstration is to show how easily multibyte strings can be mangled in PHP. | |
* | |
* A random 50-character multibyte string is generated, and then several string functions defined in | |
* $commands are eval()'ed and the output is shown. | |
* | |
* The point to notice is that the non-mb_* commands will have inaccurate or mangling behavior with | |
* multibyte strings. Also important to note is that if the mb_* functions' encoding doesn't match the | |
* encoding of the string, they will still likely have unintended behavior. | |
* | |
* All in all, it's still very easy to screw up multibyte strings in PHP. | |
*/ | |
// Encoding in which the test string is generated (also handed to mb_internal_encoding() below).
$test_string_encoding = 'UTF-8';

// Build the 50-character random test string.
$unicode_test_string = randomTestString(50, $test_string_encoding);

// Choose one character that occurs in the test string; it is the needle for strpos() / mb_strpos().
$character_in_string = randomCharacterInString($unicode_test_string, $test_string_encoding);

/**
 * The PHP expressions to evaluate and display, in order.
 *
 * Every entry is passed to eval(), so this list must only ever contain trusted, hard-coded code.
 */
$commands = array(
    // Forces the mb_* functions to use the same encoding the test string was generated with.
    // Change the value to force a mismatch, or comment the entry out to see the server's default configuration.
    sprintf('mb_internal_encoding("%s")', $test_string_encoding),

    // Shows which internal encoding is currently in effect.
    'mb_internal_encoding()',

    // The generated test string, untouched.
    '$unicode_test_string',

    // strlen() counts bytes rather than characters ...
    'strlen($unicode_test_string)',

    // ... mb_strlen() counts characters, but only if its encoding matches the string's.
    'mb_strlen($unicode_test_string)',

    // strpos() reports a byte offset ...
    sprintf('strpos($unicode_test_string, "%s")', $character_in_string),

    // ... mb_strpos() reports a character offset, with the same dependence on the encoding as mb_strlen().
    sprintf('mb_strpos($unicode_test_string, "%s")', $character_in_string),

    // substr() cuts on byte boundaries and can mangle the string ...
    'substr($unicode_test_string, -5)',

    // ... mb_substr() cuts on character boundaries when the encoding is right.
    'mb_substr($unicode_test_string, -5)',
);

// Evaluate one command inside a closure so that only $unicode_test_string (and $command) are in scope.
$evaluate = function ($command) use ($unicode_test_string) {
    return eval('return '.$command.';');
};

// Run the commands in their listed order, keyed by their own source text.
$results = array();
foreach ($commands as $command) {
    $results[$command] = $evaluate($command);
}

display_results($results, $test_string_encoding);
// ----------------------------------------------------------------------------- | |
/**
 * Generate a random string of letters, encoded in the requested encoding.
 *
 * The candidate pool is every letter (Unicode category L) among the code points
 * U+0000..U+FFFF that the target encoding can represent. Building the pool walks
 * all 0x10000 code points, which is expensive, so it is cached per encoding for
 * the lifetime of the request.
 *
 * @param integer $length   the character count of the generated string
 * @param string  $encoding the encoding of the returned string
 *
 * @return string the multibyte test string ('' when $length is 0 or negative)
 *
 * @throws RuntimeException when no letter in the range can be represented in $encoding
 */
function randomTestString($length, $encoding)
{
    // Pools already built, keyed by encoding name.
    static $pools = array();

    if (!isset($pools[$encoding])) {
        $pool = array();

        for ($codePoint = 0x0000; $codePoint <= 0xFFFF; ++$codePoint) {
            // Decode the numeric entity to UTF-8. This avoids mbstring's 'HTML-ENTITIES'
            // pseudo-encoding, which is deprecated as of PHP 8.2. Code points that cannot
            // be decoded (e.g. surrogates) come back as the unchanged entity text.
            $utf8 = html_entity_decode('&#'.$codePoint.';', ENT_NOQUOTES, 'UTF-8');

            // The /u modifier only understands UTF-8, so the letter test has to happen
            // before the character is converted to the target encoding.
            if (preg_match('/^\p{L}$/u', $utf8) !== 1) {
                continue;
            }

            $encoded = mb_convert_encoding($utf8, $encoding, 'UTF-8');

            // Drop letters the target encoding cannot represent: mbstring replaces them
            // with a substitute character, so they do not survive a round trip.
            if (mb_convert_encoding($encoded, 'UTF-8', $encoding) !== $utf8) {
                continue;
            }

            $pool[] = $encoded;
        }

        $pools[$encoding] = $pool;
    }

    $pool = $pools[$encoding];
    $lastIndex = count($pool) - 1;

    if ($lastIndex < 0) {
        throw new RuntimeException('No letters in U+0000..U+FFFF can be represented in encoding "'.$encoding.'"');
    }

    $generatedString = '';
    for ($i = 0; $i < $length; ++$i) {
        $generatedString .= $pool[mt_rand(0, $lastIndex)];
    }

    return $generatedString;
}
/**
 * Return the single character found at a randomly chosen position of a multibyte string.
 *
 * Positions are counted in characters of the given encoding, not in bytes.
 *
 * @param string $string   the string to pick a character from
 * @param string $encoding the encoding of $string
 *
 * @return string one character taken from $string
 */
function randomCharacterInString($string, $encoding)
{
    // Index of the last character, measured in characters of $encoding.
    $lastIndex = mb_strlen($string, $encoding) - 1;

    return mb_substr($string, rand(0, $lastIndex), 1, $encoding);
}
/**
 * Print a complete HTML page that lists every command next to its evaluated result.
 *
 * Commands and results are echoed as-is, without HTML escaping.
 *
 * @param array  $results  The test results, with commands as indexes and their evaluations as values
 * @param string $encoding The charset name written into the page's Content-Type meta tag
 */
function display_results(array $results, $encoding)
{
?><!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=<?php echo $encoding; ?>" />
<style type="text/css">
body {
background-color: gray;
color: black;
font-family: sans-serif;
}
td, th {
vertical-align: top;
}
td {
font-family: monospace;
padding: 4px;
font-size: 1.25em;
}
.command {
text-align: right;
color: yellow;
}
.result {
color: white;
}
</style>
</head>
<body>
<h1>PHP String Operations on Unicode Strings: A Demonstration</h1>
<table>
<thead>
<tr>
<th>Command</th>
<th>Result</th>
</tr>
</thead>
<tbody>
<?php foreach ($results as $command => $result): /* one table row per evaluated command */ ?>
<tr>
<td class="command"><?php echo $command; ?></td>
<td class="result"><?php echo $result; ?></td>
</tr>
<?php endforeach; ?>
</tbody>
</table>
</body>
</html>
<?php
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment