msjyoo/clown_emoji.php

## clown_emoji.php
<?php

$body = "@ummjackson 🤡 https://i.imgur.com/I32CQ81.jpg";
/////////000000000011111111112222222222333333333344444
/////////012345678901234567890123456789012345678901234
/////////              ^                              ^
/////////            ^ represented in single 'index-unit'

$urlStart = 14; // Inclusive
$urlEnd = 45; // Exclusive (end - start = length)

// Now, since I don't have more data:
// 1. We know that the start index is 0-indexed; and the end index is exclusive
//    because: [{"screen_name":"ummjackson","indices":[0,11]}]
//    (11) is a space character, so the end index must be exclusive.
// 2. This means that the end index that we have (45) must be exclusive too.
// 3. Therefore, the 4 byte emoji was represented in a single 'index unit' (since we can't count any further)
// 4. Therefore, the index is using Unicode codepoints as its unit

// So, notice how the indices are in Unicode codepoints - they don't care for encoding or bytes at all!
// e.g. (our surrogate pair emoji was represented in a single codepoint/'index unit'!)

var_dump(substr($body, $urlStart, ($urlEnd - $urlStart))); // So, this won't work since the indices aren't byte-indexed
var_dump(mb_substr($body, $urlStart, ($urlEnd - $urlStart), "UTF-8")); // This will work since the indices are codepoint-indexed

// But remember that Javascript doesn't index strings by Unicode codepoint: it indexes them by
// 16-bit UTF-16 "code units" (what is loosely called a "UCS-2 index").
// So, now the trick is to convert these Unicode codepoint indices into UTF-16 code unit indices.

// Now, this should be relatively easy. All we have to do is:
// 1. Scan the text up to the start index. Count the number of surrogate pairs. Increment the start index by the count.
// 2. Scan the text from the start index to the end index. Count the number of surrogate pairs. Increment the end index by the count.

// Let's first convert the text into UTF-16.
// The unpacking below reads the high byte of every code unit first, so the data has to be
// big-endian. "UTF-16BE" is requested explicitly instead of relying on the byte order chosen
// for plain "UTF-16". ($body_UTF_16_LITTLE_ENDIAN is a historical misnomer; the name is kept
// only so that anything that still reads it keeps working.)
$body2 = $body_UTF_16_LITTLE_ENDIAN = mb_convert_encoding($body, "UTF-16BE", "UTF-8");

if((strlen($body2) % 2) !== 0) {
	throw new \LogicException("Something's wrong. UTF-16 encoding must always have an even number of bytes!");
}

// How do we know if a Unicode codepoint is a surrogate pair in UTF-16? Well, the spec says:
// Lead surrogate (1st 16-bit code unit) has values: 0xD800..0xDBFF
// Trail surrogate (2nd 16-bit code unit) has values: 0xDC00..0xDFFF
//
// In reality though, we only need to check for the lead surrogate because no valid characters
// are in that range, so doesn't actually matter whether the trail surrogate is in range.

/**
 * Decodes one big-endian 16-bit code unit.
 *
 * @param string $x Exactly two bytes, most significant byte first.
 * @return int The unsigned value in the range 0..65535.
 * @throws \InvalidArgumentException When $x is not exactly two bytes long.
 */
$unpackUnsignedShort = static function(string $x): int {
	if(strlen($x) !== 2) {
		throw new \InvalidArgumentException("Expected exactly 2 bytes, got " . strlen($x) . ".");
	}

	// Square brackets: the curly-brace offset syntax ($x{0}) no longer parses on PHP 8.
	return (ord($x[0]) << 8) | ord($x[1]);
};

$delimiterLeadSurrogateLow = $unpackUnsignedShort(hex2bin("D800")); // 55296
$delimiterLeadSurrogateHigh = $unpackUnsignedShort(hex2bin("DBFF")); // 56319

var_dump($delimiterLeadSurrogateLow, $delimiterLeadSurrogateHigh);

// Now we have the range between 0 - 65535 (2 byte short) within which we can determine
// whether a codepoint is encoded using a surrogate pair. If the 2-byte "code unit" lies between
// 55296 - 56319 (inclusive), then it's a surrogate pair and we'll need to increment the indices for Javascript.

$urlStart_UCS_2 = $urlStart;
$urlEnd_UCS_2 = $urlEnd;

$codePointsCounted = 0; // We must also keep track of codepoints, since that's what the original delimiters are in

$byteLength = strlen($body2); // Loop invariant; $i below is a BYTE offset into $body2

// Scanning from the start, one 2-byte code unit at a time. We stop as soon as we reach the
// codepoint at the (exclusive) end index: surrogate pairs at or after it don't move either index.
for($i = 0; $i < $byteLength && $codePointsCounted < $urlEnd; $i += 2)
{
	$value = $unpackUnsignedShort(substr($body2, $i, 2));

	if($value >= $delimiterLeadSurrogateLow and $value <= $delimiterLeadSurrogateHigh) {
		// We have detected a surrogate pair!

		if($codePointsCounted < $urlStart) { // Encountered before the start index
			$urlStart_UCS_2 += 1;
			$urlEnd_UCS_2 += 1; // Of course, if we increase the start, the end must move as well
		} else { // Encountered inside [start, end)
			$urlEnd_UCS_2 += 1;
		}

		$i += 2; // We now skip the trail surrogate.
	}

	$codePointsCounted += 1;
}

// We can see that the indices have been corrected.
var_dump($urlStart_UCS_2, $urlEnd_UCS_2);

// Here we go!
$urlStart = $urlStart_UCS_2;
$urlEnd = $urlEnd_UCS_2;

// If you want to verify that this technique works, try adding more emojis to the original string and
// change the indices accordingly!

?>

<!DOCTYPE html>
Original string:&nbsp;<span id="output1"></span>
<br>
Corrected indices working correctly:&nbsp;<span id="output2"></span>
<script>
	// Simulated API response. The body is emitted with json_encode() so that quotes, backslashes,
	// newlines or a closing script tag inside it cannot break out of the JavaScript string literal,
	// and non-ASCII characters are written as \uXXXX escapes. The indices are cast to int.
	var response = {"id":31,"body":<?= json_encode($body, JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT) ?>,"created_at":"2017-02-14 20:17:58","comments_count":0,"likes_count":0,"entities":{"user_mentions":[{"screen_name":"ummjackson","indices":[0,11]}],"hashtags":[],"urls":[{"url":"https://i.imgur.com/I32CQ81.jpg","indices":[<?= (int) $urlStart ?>,<?= (int) $urlEnd ?>]}]},"user":{"username":"ummjackson","name":"Jackson Palmer","bio":"Marketer and data geek at Adobe by day. Coder and producer by night.","website":"http://ummjackson.com","location":"San Francisco","color":"333333","avatar_url":"https://www.gravatar.com/avatar/88a577ccfb0f22f75dbb54b72d92fea7.jpg?s=200&d=mm"}};

	// Declared explicitly instead of leaking an implicit global.
	var body = response.body.toString();

	// The body is plain text, not markup, so it is written with textContent.
	document.getElementById('output1').textContent = body;

	var start = response.entities.urls[0].indices[0];
	var end = response.entities.urls[0].indices[1];

	// String.prototype.slice() counts UTF-16 code units, which is why the indices were converted.
	document.getElementById('output2').textContent = body.slice(start, end /* exclusive */);
</script>

<!--

Note: You should provide the client with indices in both units: Unicode codepoints, and UTF-16 code units
(2 bytes each, often loosely called "UCS-2" indices). That way, it's up to the client to use whichever unit
its string type is indexed by.

Another thing is that you shouldn't mess with database encodings to satisfy Javascript - I recommend
that you ALWAYS use UTF-8 for ALL persistence. You REALLY shouldn't use any other encoding than UTF-8.

For explanation: http://utf8everywhere.org/

Also: You can of course do the same processing on the client side. But if possible: why use up precious browser time,
when you can pre-compute the values once and serve it to everyone? Especially for long strings. Maybe have database
columns for both unicode codepoint indices and UCS-2 indices?

Copyright 2017 Michael Yoo <michael@yoo.id.au> Released under MIT or Apache-2.0 or BSD-3 Clause, at your discretion.

-->
	<?php

	$body = "@ummjackson 🤡 https://i.imgur.com/I32CQ81.jpg";
	/////////000000000011111111112222222222333333333344444
	/////////012345678901234567890123456789012345678901234
	/////////              ^                              ^
	/////////            ^ represented in single 'index-unit'

	$urlStart = 14; // Inclusive
	$urlEnd = 45; // Exclusive (end - start = length)

	// Now, since I don't have more data:
	// 1. We know that the start index is 0-indexed; and the end index is exclusive
	//    because: [{"screen_name":"ummjackson","indices":[0,11]}]
	//    (11) is a space character, so the end index must be exclusive.
	// 2. This means that the end index that we have (45) must be exclusive too.
	// 3. Therefore, the 4 byte emoji was represented in a single 'index unit' (since we can't count any further)
	// 4. Therefore, the index is using Unicode codepoints as its unit

	// So, notice how the indices are in Unicode codepoints - they don't care for encoding or bytes at all!
	// e.g. (our surrogate pair emoji was represented in a single codepoint/'index unit'!)

	var_dump(substr($body, $urlStart, ($urlEnd - $urlStart))); // So, this won't work since the indices aren't byte-indexed
	var_dump(mb_substr($body, $urlStart, ($urlEnd - $urlStart), "UTF-8")); // This will work since the indices are codepoint-indexed

	// But remember that Javascript doesn't index strings by Unicode codepoint: it indexes them by
	// 16-bit UTF-16 "code units" (what is loosely called a "UCS-2 index").
	// So, now the trick is to convert these Unicode codepoint indices into UTF-16 code unit indices.

	// Now, this should be relatively easy. All we have to do is:
	// 1. Scan the text up to the start index. Count the number of surrogate pairs. Increment the start index by the count.
	// 2. Scan the text from the start index to the end index. Count the number of surrogate pairs. Increment the end index by the count.

	// Let's first convert the text into UTF-16.
	// The unpacking below reads the high byte of every code unit first, so the data has to be
	// big-endian. "UTF-16BE" is requested explicitly instead of relying on the byte order chosen
	// for plain "UTF-16". ($body_UTF_16_LITTLE_ENDIAN is a historical misnomer; the name is kept
	// only so that anything that still reads it keeps working.)
	$body2 = $body_UTF_16_LITTLE_ENDIAN = mb_convert_encoding($body, "UTF-16BE", "UTF-8");

	if((strlen($body2) % 2) !== 0) {
		throw new \LogicException("Something's wrong. UTF-16 encoding must always have an even number of bytes!");
	}

	// How do we know if a Unicode codepoint is a surrogate pair in UTF-16? Well, the spec says:
	// Lead surrogate (1st 16-bit code unit) has values: 0xD800..0xDBFF
	// Trail surrogate (2nd 16-bit code unit) has values: 0xDC00..0xDFFF
	//
	// In reality though, we only need to check for the lead surrogate because no valid characters
	// are in that range, so doesn't actually matter whether the trail surrogate is in range.

	/**
	 * Decodes one big-endian 16-bit code unit.
	 *
	 * @param string $x Exactly two bytes, most significant byte first.
	 * @return int The unsigned value in the range 0..65535.
	 * @throws \InvalidArgumentException When $x is not exactly two bytes long.
	 */
	$unpackUnsignedShort = static function(string $x): int {
		if(strlen($x) !== 2) {
			throw new \InvalidArgumentException("Expected exactly 2 bytes, got " . strlen($x) . ".");
		}

		// Square brackets: the curly-brace offset syntax ($x{0}) no longer parses on PHP 8.
		return (ord($x[0]) << 8) | ord($x[1]);
	};

	$delimiterLeadSurrogateLow = $unpackUnsignedShort(hex2bin("D800")); // 55296
	$delimiterLeadSurrogateHigh = $unpackUnsignedShort(hex2bin("DBFF")); // 56319

	var_dump($delimiterLeadSurrogateLow, $delimiterLeadSurrogateHigh);

	// Now we have the range between 0 - 65535 (2 byte short) within which we can determine
	// whether a codepoint is encoded using a surrogate pair. If the 2-byte "code unit" lies between
	// 55296 - 56319 (inclusive), then it's a surrogate pair and we'll need to increment the indices for Javascript.

	$urlStart_UCS_2 = $urlStart;
	$urlEnd_UCS_2 = $urlEnd;

	$codePointsCounted = 0; // We must also keep track of codepoints, since that's what the original delimiters are in

	$byteLength = strlen($body2); // Loop invariant; $i below is a BYTE offset into $body2

	// Scanning from the start, one 2-byte code unit at a time. We stop as soon as we reach the
	// codepoint at the (exclusive) end index: surrogate pairs at or after it don't move either index.
	for($i = 0; $i < $byteLength && $codePointsCounted < $urlEnd; $i += 2)
	{
		$value = $unpackUnsignedShort(substr($body2, $i, 2));

		if($value >= $delimiterLeadSurrogateLow and $value <= $delimiterLeadSurrogateHigh) {
			// We have detected a surrogate pair!

			if($codePointsCounted < $urlStart) { // Encountered before the start index
				$urlStart_UCS_2 += 1;
				$urlEnd_UCS_2 += 1; // Of course, if we increase the start, the end must move as well
			} else { // Encountered inside [start, end)
				$urlEnd_UCS_2 += 1;
			}

			$i += 2; // We now skip the trail surrogate.
		}

		$codePointsCounted += 1;
	}

	// We can see that the indices have been corrected.
	var_dump($urlStart_UCS_2, $urlEnd_UCS_2);

	// Here we go!
	$urlStart = $urlStart_UCS_2;
	$urlEnd = $urlEnd_UCS_2;

	// If you want to verify that this technique works, try adding more emojis to the original string and
	// change the indices accordingly!

	?>

	<!DOCTYPE html>
	Original string:&nbsp;<span id="output1"></span>
	<br>
	Corrected indices working correctly:&nbsp;<span id="output2"></span>
	<script>
		// Simulated API response. The body is emitted with json_encode() so that quotes, backslashes,
		// newlines or a closing script tag inside it cannot break out of the JavaScript string literal,
		// and non-ASCII characters are written as \uXXXX escapes. The indices are cast to int.
		var response = {"id":31,"body":<?= json_encode($body, JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT) ?>,"created_at":"2017-02-14 20:17:58","comments_count":0,"likes_count":0,"entities":{"user_mentions":[{"screen_name":"ummjackson","indices":[0,11]}],"hashtags":[],"urls":[{"url":"https://i.imgur.com/I32CQ81.jpg","indices":[<?= (int) $urlStart ?>,<?= (int) $urlEnd ?>]}]},"user":{"username":"ummjackson","name":"Jackson Palmer","bio":"Marketer and data geek at Adobe by day. Coder and producer by night.","website":"http://ummjackson.com","location":"San Francisco","color":"333333","avatar_url":"https://www.gravatar.com/avatar/88a577ccfb0f22f75dbb54b72d92fea7.jpg?s=200&d=mm"}};

		// Declared explicitly instead of leaking an implicit global.
		var body = response.body.toString();

		// The body is plain text, not markup, so it is written with textContent.
		document.getElementById('output1').textContent = body;

		var start = response.entities.urls[0].indices[0];
		var end = response.entities.urls[0].indices[1];

		// String.prototype.slice() counts UTF-16 code units, which is why the indices were converted.
		document.getElementById('output2').textContent = body.slice(start, end /* exclusive */);
	</script>

	<!--

	Note: You should provide the client with indices in both units: Unicode codepoints, and UTF-16 code units
	(2 bytes each, often loosely called "UCS-2" indices). That way, it's up to the client to use whichever unit
	its string type is indexed by.

	Another thing is that you shouldn't mess with database encodings to satisfy Javascript - I recommend
	that you ALWAYS use UTF-8 for ALL persistence. You REALLY shouldn't use any other encoding than UTF-8.

	For explanation: http://utf8everywhere.org/

	Also: You can of course do the same processing on the client side. But if possible: why use up precious browser time,
	when you can pre-compute the values once and serve it to everyone? Especially for long strings. Maybe have database
	columns for both unicode codepoint indices and UCS-2 indices?

	Copyright 2017 Michael Yoo <michael@yoo.id.au> Released under MIT or Apache-2.0 or BSD-3 Clause, at your discretion.

	-->