Created
February 15, 2017 18:56
-
-
Save msjyoo/cef5d6705c2d38b666741ce6f9198115 to your computer and use it in GitHub Desktop.
Example code for displaying Emojis with slices.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Sample post body. The ruler underneath numbers every character; its 9-slash prefix is as
// wide as `$body = "`, so each ruler column sits below the character it indexes.
$body = "@ummjackson 🤡 https://i.imgur.com/I32CQ81.jpg";
/////////000000000011111111112222222222333333333344444  <- tens digit of the index
/////////012345678901234567890123456789012345678901234  <- ones digit of the index
// Index 12 is the emoji: it occupies a single 'index-unit'.
// Index 14 is the first character of the URL; index 45 is one past its last character.
$urlStart = 14; // Inclusive
$urlEnd = 45; // Exclusive (end - start = length)

// What can be deduced about the indices from this one sample:
//   1. They are 0-based with an exclusive end. The mention entity is
//      [{"screen_name":"ummjackson","indices":[0,11]}], and position 11 is the space
//      after the name, so 11 cannot be part of the mention.
//   2. The URL's end index (45) must therefore be exclusive as well.
//   3. For the URL to start at 14 and end at 45, the 4-byte emoji has to count as exactly
//      one index unit - there is no room for it to count as more.
//   4. Hence the unit is the Unicode codepoint. Encoding and byte length play no role:
//      an emoji that needs a surrogate pair in UTF-16 is still a single unit here.
$urlLength = $urlEnd - $urlStart;

// Byte-based slicing is wrong for these indices: they do not count bytes.
var_dump(substr($body, $urlStart, $urlLength));
// Codepoint-based slicing is right: the indices count codepoints.
var_dump(mb_substr($body, $urlStart, $urlLength, "UTF-8"));
// JavaScript does not index strings by Unicode codepoint. It indexes them by 16-bit UTF-16
// "code units" (often loosely called UCS-2 indices), so a codepoint outside the Basic
// Multilingual Plane - stored as a surrogate pair - takes up TWO index positions.
//
// The trick is therefore to convert the codepoint indices into UTF-16 code unit indices:
//   1. Scan the text up to the start index and count the surrogate pairs.
//      Shift both the start and the end index by that count.
//   2. Scan the text from the start index to the end index and count the surrogate pairs.
//      Shift the end index by that count.
//
// First, convert the text into UTF-16. Big-endian is requested explicitly because the
// code-unit decoder used further down reads the high byte first; the bare "UTF-16" label
// would leave the byte order up to mbstring.
// NOTE: $body_UTF_16_LITTLE_ENDIAN is kept only so the variable still exists for any code
// that reads it; the name is a misnomer - the data is big-endian.
$body2 = $body_UTF_16_LITTLE_ENDIAN = mb_convert_encoding($body, "UTF-16BE", "UTF-8");

// Every UTF-16 code unit is 2 bytes, so an odd byte count means the conversion went wrong.
if ((strlen($body2) % 2) !== 0) {
    throw new \LogicException("Something's wrong. UTF-16 encoding must always have an even number of bytes!");
}
// How do we know whether a codepoint is stored as a surrogate pair in UTF-16? The spec says:
//   Lead surrogate  (1st 16-bit code unit) has values: 0xD800..0xDBFF
//   Trail surrogate (2nd 16-bit code unit) has values: 0xDC00..0xDFFF
//
// In practice only the lead surrogate needs checking: no valid character lives in that
// range, so a code unit inside it can only be the first half of a pair.

/**
 * Decodes one big-endian 16-bit code unit from a 2-byte binary string.
 *
 * @param string $x Exactly two bytes, high byte first.
 *
 * @return int The unsigned value, 0..65535.
 *
 * @throws \InvalidArgumentException When $x is not exactly two bytes long.
 */
$unpackUnsignedShort = static function (string $x): int {
    // Reject short input as well as long input: reading offset 1 of a 0- or 1-byte string
    // would be an out-of-range access.
    if (strlen($x) !== 2) {
        throw new \InvalidArgumentException;
    }

    // Square-bracket offsets: the curly-brace form ($x{0}) no longer parses on PHP 8.
    return (ord($x[0]) << 8) | ord($x[1]);
};

$delimiterLeadSurrogateLow = $unpackUnsignedShort(hex2bin("D800")); // 55296
$delimiterLeadSurrogateHigh = $unpackUnsignedShort(hex2bin("DBFF")); // 56319
var_dump($delimiterLeadSurrogateLow, $delimiterLeadSurrogateHigh);

// A 2-byte code unit can hold 0..65535. If its value lies within 55296..56319 (inclusive),
// it is the lead half of a surrogate pair, and the indices handed to JavaScript need
// shifting by one for that pair.
// Translate the codepoint indices ($urlStart / $urlEnd) into UTF-16 code unit indices by
// walking the UTF-16 bytes in $body2 and adding one for every surrogate pair that sits in
// front of the respective index.
$urlStart_UCS_2 = $urlStart;
$urlEnd_UCS_2 = $urlEnd;
$codePointsCounted = 0; // The original indices are in codepoints, so codepoints must be counted too

// $i is a BYTE offset that advances one 2-byte code unit per step, so the upper bound is the
// full byte length of $body2 (halving it would leave the second half of the text unscanned).
// Scanning stops once $urlEnd codepoints have been seen: a surrogate pair located after the
// URL must not shift either index.
$byteLength = strlen($body2);
for ($i = 0; $i < $byteLength && $codePointsCounted < $urlEnd; $i += 2) {
    $value = $unpackUnsignedShort(substr($body2, $i, 2));

    if ($value >= $delimiterLeadSurrogateLow && $value <= $delimiterLeadSurrogateHigh) {
        // Lead surrogate found: this codepoint takes two code units in JavaScript.
        if ($codePointsCounted < $urlStart) {
            // In front of the URL: the whole URL moves one unit to the right.
            $urlStart_UCS_2 += 1;
            $urlEnd_UCS_2 += 1;
        } else {
            // Inside the URL: only its end moves.
            $urlEnd_UCS_2 += 1;
        }

        $i += 2; // Skip the trail surrogate; it belongs to the same codepoint.
    }

    $codePointsCounted += 1;
}

// We can see that the indices have been corrected.
var_dump($urlStart_UCS_2, $urlEnd_UCS_2);

// Here we go! From this point on the indices are in UTF-16 code units.
$urlStart = $urlStart_UCS_2;
$urlEnd = $urlEnd_UCS_2;

// To verify that this technique works, try adding more emojis to the original string
// (before, inside and after the URL) and change the indices accordingly!
?>
<!DOCTYPE html>
Original string: <span id="output1"></span>
<br>
Corrected indices working correctly: <span id="output2"></span>
<script>
// The post body is emitted with json_encode() so that quotes, backslashes, newlines and
// "</script>" inside it cannot break out of the string literal. Non-ASCII characters are
// emitted as \uXXXX escapes, so the emoji survives whatever charset the page is served in.
var response = {"id":31,"body":<?= json_encode($body, JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT) ?>,"created_at":"2017-02-14 20:17:58","comments_count":0,"likes_count":0,"entities":{"user_mentions":[{"screen_name":"ummjackson","indices":[0,11]}],"hashtags":[],"urls":[{"url":"https://i.imgur.com/I32CQ81.jpg","indices":[<?= (int) $urlStart ?>,<?= (int) $urlEnd ?>]}]},"user":{"username":"ummjackson","name":"Jackson Palmer","bio":"Marketer and data geek at Adobe by day. Coder and producer by night.","website":"http://ummjackson.com","location":"San Francisco","color":"333333","avatar_url":"https://www.gravatar.com/avatar/88a577ccfb0f22f75dbb54b72d92fea7.jpg?s=200&d=mm"}};

// Declared with var so it is not created as an implicit global.
var body = response.body.toString();

// textContent rather than innerHTML: the body is plain text and must not be parsed as markup.
document.getElementById('output1').textContent = body;

var start = response.entities.urls[0].indices[0];
var end = response.entities.urls[0].indices[1];

// String.prototype.slice counts UTF-16 code units, which is what the corrected indices are in.
document.getElementById('output2').textContent = body.slice(start, end /* exclusive */);
</script>
<!--
Note: You should provide the client with indices in both units: Unicode codepoints, and UTF-16 code units (the 2-byte "UCS-2" indices that JavaScript uses).
That way, it's up to the client whether or not it can use a proper encoding.
Another thing: you shouldn't mess with database encodings to satisfy JavaScript - I recommend
that you ALWAYS use UTF-8 for ALL persistence. You REALLY shouldn't use any encoding other than UTF-8.
For an explanation, see: http://utf8everywhere.org/
Also: you can of course do the same processing on the client side. But if possible, why use up precious browser time
when you can pre-compute the values once and serve them to everyone? Especially for long strings. Maybe have database
columns for both the Unicode codepoint indices and the UCS-2 indices?
Copyright 2017 Michael Yoo <michael@yoo.id.au> Released under MIT or Apache-2.0 or BSD-3 Clause, at your discretion.
-->
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Could be a bug at line 73, scanning only half of the string (strlen/2 and $i += 2, duplicates?) @ummjackson