Created
March 28, 2019 23:11
-
-
Save GrabYourPitchforks/e07d779c95633ec4e4e28e4fd1fbd3cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// In a loop, try reading a natural word at a time.
//
// Each iteration consumes CharsPerNuint UTF-16 code units from pInputBuffer and,
// for every char in the word, adds to tempUtf8CodeUnitCountAdjustment:
//   +1 if the char is >= 0x0080, and
//   +1 more if the char is >= 0x0800.
// All-ASCII words add nothing.

const int CharsPerNuint = sizeof(nuint) / sizeof(char);

for (; inputLength >= CharsPerNuint; pInputBuffer += CharsPerNuint, inputLength -= CharsPerNuint)
{
    nuint utf16Data = Unsafe.ReadUnaligned<nuint>(pInputBuffer);

    // Keep only the bits that are set when a char is non-ASCII (>= 0x0080).
    utf16Data &= unchecked((nuint)0xFF80_FF80_FF80_FF80ul);

    if (utf16Data == 0)
    {
        continue; // found all-ASCII data; keep going!
    }

    // The low 7 bits of every 16-bit lane were just cleared, so this shift cannot
    // leak bits from one lane into its neighbor.
    utf16Data >>= 7; // each word goes from [ xxxxxxxx x0000000 ] to [ 0000000x xxxxxxxx ]

    // Non-ASCII data incoming. First, do a very quick check for surrogates.
    // If we see even a single surrogate char we'll fall back to char-by-char
    // stripping of the input data.
    //
    // Each lane now holds (char >> 7), a value in [0x000, 0x1FF], so the additions
    // below cannot carry out of a lane.

    nuint charsEqualToOrOverD800 = utf16Data + unchecked((nuint)0x7E50_7E50_7E50_7E50ul); // 0x8000 bit indicates >= 0xD800
    nuint charsEqualToOrOverE000 = utf16Data + unchecked((nuint)0x7E40_7E40_7E40_7E40ul); // 0x8000 bit indicates >= 0xE000

    // A lane is a surrogate iff it is >= 0xD800 but not >= 0xE000, i.e. the two
    // indicator bits differ.
    if (((charsEqualToOrOverD800 ^ charsEqualToOrOverE000) & unchecked((nuint)0x8000_8000_8000_8000ul)) == 0)
    {
        // No surrogates found here. Use the popcnt mechanism from earlier to
        // track the number of 2-byte and 3-byte chars seen. We don't need
        // to worry about endianness issues here because the input words
        // were already in machine-endian order.

        nuint twoOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x3FFF_3FFF_3FFF_3FFFul); // 0x4000 bit indicates >= 0x0080
        nuint threeOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x7FF0_7FF0_7FF0_7FF0ul); // 0x8000 bit indicates >= 0x0800

        // The two indicator bits live at different positions within each lane, so
        // OR-ing them together and counting set bits sums both tallies at once.
        nuint combinedMask = (twoOrMoreUtf8BytesMask & unchecked((nuint)0x4000_4000_4000_4000ul))
            | (threeOrMoreUtf8BytesMask & unchecked((nuint)0x8000_8000_8000_8000ul));

        tempUtf8CodeUnitCountAdjustment += (uint)BitOperations.PopCount(combinedMask);
    }
    else
    {
        // Surrogates found. Strip off chars one-by-one.

        for (int i = CharsPerNuint - 1; i >= 0; i--)
        {
            uint thisChar; // remember: it was previously shifted right by 7, so [ 0000000x xxxxxxxx ]

            if (BitConverter.IsLittleEndian)
            {
                // First char in memory is the lowest lane.
                thisChar = (ushort)utf16Data;
                utf16Data >>= 16;
            }
            else
            {
                // First char in memory is the highest lane; rotate it down.
                utf16Data = BitOperations.RotateLeft(utf16Data, 16);
                thisChar = (ushort)utf16Data;
            }

            // thisChar is at most 0x1FF, so each sum below is at most 0x3FE and the
            // indicator bit is bit 9 (0x200). Shifting by 9 therefore yields 0 or 1.
            tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01FF) >> 9; // 1 if 0080..FFFF, else 0
            tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01F0) >> 9; // 1 if 0800..FFFF, else 0

            if (!IsInRangeInclusive(thisChar, 0xD800u >> 7, 0xDFFF >> 7))
            {
                continue; // not a surrogate - keep going!
            }

            // NOTE(review): no surrogate-specific handling follows in this snippet, so a
            // surrogate char is currently tallied exactly like any other char >= 0x0800.
            // Presumably pair validation / adjustment belongs here - confirm against the
            // full routine.
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment