msjyoo/Counting Clown Emojis.php

## Counting Clown Emojis.php
<?php

// Subject: the clown face emoji (U+1F921), a code point outside the Basic Multilingual Plane.
// JavaScript (checked in Firefox) reports:
//   "🤡".length    -> 2
//   '🤡abc'.length -> 5

// Legend for the byte diagrams below: each "." is one byte, and "(...)" groups the bytes
// that together encode the single emoji code point.
$x = '🤡';    // UTF-8: (....)       -> 4 bytes
$y = '🤡abc'; // UTF-8: (....) . . . -> 4 + 3 = 7 bytes

// A PHP string is just a sequence of raw bytes with no encoding attached. $x and $y hold
// UTF-8 only because the literals were copy-pasted from a browser.

echo "--- These are UTF-8 ---", "\n";
// strlen() counts bytes, mb_strlen() counts code points, bin2hex() dumps the raw bytes.
echo "\$x Bytes: ", strlen($x), "\n",
     "\$x Unicode Codepoint Count (\"characters\"): ", mb_strlen($x, "UTF-8"), "\n",
     "\$x Hex Representation: ", bin2hex($x), "\n";
echo "\$y Bytes: ", strlen($y), "\n",
     "\$y Unicode Codepoint Count (\"characters\"): ", mb_strlen($y, "UTF-8"), "\n",
     "\$y Hex Representation: ", bin2hex($y), "\n";
echo "--- End ---", "\n";

// Next, let's re-encode both strings as UTF-16. A UTF-16 code unit is 2 bytes: "a", "b"
// and "c" each take one code unit, while the emoji takes a surrogate pair (two code
// units, i.e. 4 bytes).
$x1 = mb_convert_encoding($x, "UTF-16", "UTF-8"); // UTF-16: (.. ..)          -> still 4 bytes
$y1 = mb_convert_encoding($y, "UTF-16", "UTF-8"); // UTF-16: (.. ..) .. .. .. -> 4 + 6 = 10 bytes

echo "--- These are UTF-16 ---", "\n";
echo "\$x1 Bytes: ", strlen($x1), "\n",
     "\$x1 Unicode Codepoint Count (\"characters\"): ", mb_strlen($x1, "UTF-16"), "\n",
     "\$x1 Hex Representation: ", bin2hex($x1), "\n";
echo "\$y1 Bytes: ", strlen($y1), "\n",
     "\$y1 Unicode Codepoint Count (\"characters\"): ", mb_strlen($y1, "UTF-16"), "\n",
     "\$y1 Hex Representation: ", bin2hex($y1), "\n";
echo "--- End ---", "\n";

// A JavaScript String resembles PHP's raw byte string, with one important difference.
// Quoting https://mathiasbynens.be/notes/javascript-encoding :
//
//   JavaScript treats code units as individual characters, while humans generally think
//   in terms of Unicode characters. This has some unfortunate consequences for Unicode
//   characters outside the BMP. Since surrogate pairs consist of two code units,
//   '𝌆'.length == 2, even though there’s only one Unicode character there. The individual
//   surrogate halves are being exposed as if they were characters: '𝌆' == '\uD834\uDF06'.
//
// In other words: a code-point-aware count treats a surrogate pair (.. ..) as length 1,
// whereas JavaScript counts each half on its own, giving length 2.
//
// JavaScript therefore sees our two strings as:
//   $x1: .. ..          -> 2
//   $y1: .. .. .. .. .. -> 5
//
// Every UTF-16 code unit is exactly 2 bytes, so emulating JavaScript's length only takes
// the byte length of the UTF-16 encoding, halved.

echo "--- These are UTF-16 ---", "\n";
echo "\$x1 Javascript Emulated strlen/2: ", (strlen($x1) / 2), "\n";
echo "\$y1 Javascript Emulated strlen/2: ", (strlen($y1) / 2), "\n";
echo "--- End ---", "\n";

// The two results (2 and 5) match the JavaScript lengths listed at the top.
	<?php

	// Subject: the clown face emoji (U+1F921), a code point outside the Basic Multilingual Plane.
	// JavaScript (checked in Firefox) reports:
	//   "🤡".length    -> 2
	//   '🤡abc'.length -> 5

	// Legend for the byte diagrams below: each "." is one byte, and "(...)" groups the bytes
	// that together encode the single emoji code point.
	$x = '🤡';    // UTF-8: (....)       -> 4 bytes
	$y = '🤡abc'; // UTF-8: (....) . . . -> 4 + 3 = 7 bytes

	// A PHP string is just a sequence of raw bytes with no encoding attached. $x and $y hold
	// UTF-8 only because the literals were copy-pasted from a browser.

	echo "--- These are UTF-8 ---", "\n";
	// strlen() counts bytes, mb_strlen() counts code points, bin2hex() dumps the raw bytes.
	echo "\$x Bytes: ", strlen($x), "\n",
	     "\$x Unicode Codepoint Count (\"characters\"): ", mb_strlen($x, "UTF-8"), "\n",
	     "\$x Hex Representation: ", bin2hex($x), "\n";
	echo "\$y Bytes: ", strlen($y), "\n",
	     "\$y Unicode Codepoint Count (\"characters\"): ", mb_strlen($y, "UTF-8"), "\n",
	     "\$y Hex Representation: ", bin2hex($y), "\n";
	echo "--- End ---", "\n";

	// Next, let's re-encode both strings as UTF-16. A UTF-16 code unit is 2 bytes: "a", "b"
	// and "c" each take one code unit, while the emoji takes a surrogate pair (two code
	// units, i.e. 4 bytes).
	$x1 = mb_convert_encoding($x, "UTF-16", "UTF-8"); // UTF-16: (.. ..)          -> still 4 bytes
	$y1 = mb_convert_encoding($y, "UTF-16", "UTF-8"); // UTF-16: (.. ..) .. .. .. -> 4 + 6 = 10 bytes

	echo "--- These are UTF-16 ---", "\n";
	echo "\$x1 Bytes: ", strlen($x1), "\n",
	     "\$x1 Unicode Codepoint Count (\"characters\"): ", mb_strlen($x1, "UTF-16"), "\n",
	     "\$x1 Hex Representation: ", bin2hex($x1), "\n";
	echo "\$y1 Bytes: ", strlen($y1), "\n",
	     "\$y1 Unicode Codepoint Count (\"characters\"): ", mb_strlen($y1, "UTF-16"), "\n",
	     "\$y1 Hex Representation: ", bin2hex($y1), "\n";
	echo "--- End ---", "\n";

	// A JavaScript String resembles PHP's raw byte string, with one important difference.
	// Quoting https://mathiasbynens.be/notes/javascript-encoding :
	//
	//   JavaScript treats code units as individual characters, while humans generally think
	//   in terms of Unicode characters. This has some unfortunate consequences for Unicode
	//   characters outside the BMP. Since surrogate pairs consist of two code units,
	//   '𝌆'.length == 2, even though there’s only one Unicode character there. The individual
	//   surrogate halves are being exposed as if they were characters: '𝌆' == '\uD834\uDF06'.
	//
	// In other words: a code-point-aware count treats a surrogate pair (.. ..) as length 1,
	// whereas JavaScript counts each half on its own, giving length 2.
	//
	// JavaScript therefore sees our two strings as:
	//   $x1: .. ..          -> 2
	//   $y1: .. .. .. .. .. -> 5
	//
	// Every UTF-16 code unit is exactly 2 bytes, so emulating JavaScript's length only takes
	// the byte length of the UTF-16 encoding, halved.

	echo "--- These are UTF-16 ---", "\n";
	echo "\$x1 Javascript Emulated strlen/2: ", (strlen($x1) / 2), "\n";
	echo "\$y1 Javascript Emulated strlen/2: ", (strlen($y1) / 2), "\n";
	echo "--- End ---", "\n";

	// The two results (2 and 5) match the JavaScript lengths listed at the top.