Skip to content

Instantly share code, notes, and snippets.

@GrabYourPitchforks
Created March 28, 2019 23:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GrabYourPitchforks/e07d779c95633ec4e4e28e4fd1fbd3cc to your computer and use it in GitHub Desktop.
Save GrabYourPitchforks/e07d779c95633ec4e4e28e4fd1fbd3cc to your computer and use it in GitHub Desktop.
// In a loop, try reading a natural word (nuint) at a time. For every char in the
// word, tempUtf8CodeUnitCountAdjustment is incremented once if the char is >= 0x0080
// and once more if the char is >= 0x0800.
const int CharsPerNuint = sizeof(nuint) / sizeof(char);
for (; inputLength >= CharsPerNuint; pInputBuffer += CharsPerNuint, inputLength -= CharsPerNuint)
{
    // Keep only the bits that make a char non-ASCII (0xFF80 in each 16-bit lane).
    // The cast is unchecked so the 64-bit pattern can be narrowed to the width of nuint.
    nuint utf16Data = Unsafe.ReadUnaligned<nuint>(pInputBuffer);
    utf16Data &= unchecked((nuint)0xFF80_FF80_FF80_FF80ul);
    if (utf16Data == 0)
    {
        continue; // found all-ASCII data; keep going!
    }

    // The low 7 bits of every lane are zero after the mask above, so this shift
    // cannot move set bits from one lane into its neighbor:
    // each word goes from [ xxxxxxxx x0000000 ] to [ 0000000x xxxxxxxx ].
    utf16Data >>= 7;

    // Non-ASCII data incoming. First, do a very quick check for surrogates.
    // If we see even a single surrogate char we'll fall back to char-by-char
    // stripping of the input data.
    // Each lane holds at most 0x01FF, so the additions below never carry across lanes.
    nuint charsEqualToOrOverD800 = utf16Data + unchecked((nuint)0x7E50_7E50_7E50_7E50ul); // 0x8000 bit indicates >= 0xD800
    nuint charsEqualToOrOverE000 = utf16Data + unchecked((nuint)0x7E40_7E40_7E40_7E40ul); // 0x8000 bit indicates >= 0xE000

    // The two 0x8000 bits of a lane differ only when that char is in [ D800..DFFF ].
    if (((charsEqualToOrOverD800 ^ charsEqualToOrOverE000) & unchecked((nuint)0x8000_8000_8000_8000ul)) == 0)
    {
        // No surrogates found here. Build a mask with one bit set per lane that is
        // >= 0x0080 and another bit per lane that is >= 0x0800, then popcount it.
        // Lane order is irrelevant to a population count, so there are no
        // endianness concerns on this path.
        nuint twoOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x3FFF_3FFF_3FFF_3FFFul); // 0x4000 bit indicates >= 0x0080
        nuint threeOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x7FF0_7FF0_7FF0_7FF0ul); // 0x8000 bit indicates >= 0x0800
        nuint combinedMask = (twoOrMoreUtf8BytesMask & unchecked((nuint)0x4000_4000_4000_4000ul))
            | (threeOrMoreUtf8BytesMask & unchecked((nuint)0x8000_8000_8000_8000ul));
        tempUtf8CodeUnitCountAdjustment += (uint)BitOperations.PopCount(combinedMask);
    }
    else
    {
        // Surrogates found. Strip off chars one-by-one, in memory order.
        for (int i = CharsPerNuint - 1; i >= 0; i--)
        {
            uint thisChar; // remember: it was previously shifted right by 7, so [ 0000000x xxxxxxxx ]
            if (BitConverter.IsLittleEndian)
            {
                // The first char in memory occupies the least significant lane.
                thisChar = (ushort)utf16Data;
                utf16Data >>= 16;
            }
            else
            {
                // The first char in memory occupies the most significant lane;
                // rotate it down into the low 16 bits before extracting it.
                utf16Data = BitOperations.RotateLeft(utf16Data, 16);
                thisChar = (ushort)utf16Data;
            }

            // thisChar is a 9-bit value (0x000..0x1FF), so the carry out of the
            // addition lands in bit 9. Shifting by 9 (not 10) extracts it; a shift
            // of 10 would always produce 0.
            tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01FF) >> 9; // 1 if 0080..FFFF, else 0
            tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01F0) >> 9; // 1 if 0800..FFFF, else 0

            if (!IsInRangeInclusive(thisChar, 0xD800u >> 7, 0xDFFF >> 7))
            {
                continue; // not a surrogate - keep going!
            }

            // NOTE(review): a surrogate char reaches this point having been counted
            // by both additions above, but this fragment performs no high/low pairing
            // validation and no count correction for surrogate pairs.
            // TODO: implement, or confirm it is handled by code outside this loop.
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment