GrabYourPitchforks/nonvector_utf16validation.cs

## nonvector_utf16validation.cs
    // In a loop, try reading a natural word at a time.

            // Number of UTF-16 chars consumed per loop iteration (one nuint's worth).
            const int CharsPerNuint = sizeof(nuint) / sizeof(char);
            for (; inputLength >= CharsPerNuint; pInputBuffer += CharsPerNuint, inputLength -= CharsPerNuint)
            {
                // Read one natural word of chars; ReadUnaligned is used, so no alignment is required here.
                nuint utf16Data = Unsafe.ReadUnaligned<nuint>(pInputBuffer);

                // Clear the low 7 bits of every 16-bit lane. A lane that becomes zero held a char <= 0x007F.
                utf16Data &= unchecked((nuint)0xFF80_FF80_FF80_FF80ul);
                if (utf16Data == 0)
                {
                    continue; // found all-ASCII data; keep going!
                }

                // The low 7 bits of each lane are zero, so this shift cannot leak set bits across lanes.
                utf16Data >>= 7; // each word goes from [ xxxxxxxx x0000000 ] to [ 0000000x xxxxxxxx ]

                // Non-ASCII data incoming. First, do a very quick check for surrogates.
                // If we see even a single surrogate char we'll fall back to char-by-char
                // stripping of the input data.
                //
                // Each lane is now (char >> 7), i.e. at most 0x01FF, so adding a per-lane bias
                // never carries into the neighbouring lane. 0xD800 >> 7 == 0x01B0 and
                // 0xE000 >> 7 == 0x01C0, hence the biases 0x7E50 and 0x7E40.

                nuint charsEqualToOrOverD800 = utf16Data + unchecked((nuint)0x7E50_7E50_7E50_7E50ul); // 0x8000 bit indicates >= 0xD800
                nuint charsEqualToOrOverE000 = utf16Data + unchecked((nuint)0x7E40_7E40_7E40_7E40ul); // 0x8000 bit indicates >= 0xE000

                // A lane is a surrogate exactly when it is >= 0xD800 but not >= 0xE000,
                // i.e. when the two 0x8000 flags differ.
                if (((charsEqualToOrOverD800 ^ charsEqualToOrOverE000) & unchecked((nuint)0x8000_8000_8000_8000ul)) == 0)
                {
                    // No surrogates found here. Use the popcnt mechanism from earlier to
                    // track the number of 2-byte and 3-byte chars seen. We don't need
                    // to worry about endianness issues here because the input words
                    // were already in machine-endian order.

                    nuint twoOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x3FFF_3FFF_3FFF_3FFFul); // 0x4000 bit indicates >= 0x0080
                    nuint threeOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x7FF0_7FF0_7FF0_7FF0ul); // 0x8000 bit indicates >= 0x0800

                    // One set bit per char >= 0x0080 plus one more per char >= 0x0800.
                    nuint combinedMask = (twoOrMoreUtf8BytesMask & unchecked((nuint)0x4000_4000_4000_4000ul))
                        | (threeOrMoreUtf8BytesMask & unchecked((nuint)0x8000_8000_8000_8000ul));

                    tempUtf8CodeUnitCountAdjustment += (uint)BitOperations.PopCount(combinedMask);
                }
                else
                {
                    // Surrogates found. Strip off chars one-by-one.

                    for (int i = CharsPerNuint - 1; i >= 0; i--)
                    {
                        uint thisChar; // remember: it was previously shifted right by 7, so [ 0000000x xxxxxxxx ]

                        if (BitConverter.IsLittleEndian)
                        {
                            // Take the lowest lane, then shift the next one down.
                            thisChar = (ushort)utf16Data;
                            utf16Data >>= 16;
                        }
                        else
                        {
                            // Rotate the highest lane into the lowest position, then take it.
                            utf16Data = BitOperations.RotateLeft(utf16Data, 16);
                            thisChar = (ushort)utf16Data;
                        }

                        // thisChar is at most 0x01FF, so after adding the bias the result flag is
                        // bit 9 (0x0200) and the sum never exceeds 0x03FE; shifting right by 9
                        // therefore yields exactly 0 or 1.
                        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01FF) >> 9; // 1 if 0080..FFFF, else 0
                        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01F0) >> 9; // 1 if 0800..FFFF, else 0

                        if (!IsInRangeInclusive(thisChar, 0xD800u >> 7, 0xDFFF >> 7))
                        {
                            continue; // not a surrogate - keep going!
                        }

                        // TODO: surrogate handling is not implemented in this fragment. A surrogate
                        // char currently gets the same adjustment as any other char >= 0x0800, and
                        // nothing here checks that high and low surrogates are correctly paired.
                    }
                }
            }
	// In a loop, try reading a natural word at a time.

	// Number of UTF-16 chars consumed per loop iteration (one nuint's worth).
	const int CharsPerNuint = sizeof(nuint) / sizeof(char);
	for (; inputLength >= CharsPerNuint; pInputBuffer += CharsPerNuint, inputLength -= CharsPerNuint)
	{
		// Read one natural word of chars; ReadUnaligned is used, so no alignment is required here.
		nuint utf16Data = Unsafe.ReadUnaligned<nuint>(pInputBuffer);

		// Clear the low 7 bits of every 16-bit lane. A lane that becomes zero held a char <= 0x007F.
		utf16Data &= unchecked((nuint)0xFF80_FF80_FF80_FF80ul);
		if (utf16Data == 0)
		{
			continue; // found all-ASCII data; keep going!
		}

		// The low 7 bits of each lane are zero, so this shift cannot leak set bits across lanes.
		utf16Data >>= 7; // each word goes from [ xxxxxxxx x0000000 ] to [ 0000000x xxxxxxxx ]

		// Non-ASCII data incoming. First, do a very quick check for surrogates.
		// If we see even a single surrogate char we'll fall back to char-by-char
		// stripping of the input data.
		//
		// Each lane is now (char >> 7), i.e. at most 0x01FF, so adding a per-lane bias
		// never carries into the neighbouring lane. 0xD800 >> 7 == 0x01B0 and
		// 0xE000 >> 7 == 0x01C0, hence the biases 0x7E50 and 0x7E40.

		nuint charsEqualToOrOverD800 = utf16Data + unchecked((nuint)0x7E50_7E50_7E50_7E50ul); // 0x8000 bit indicates >= 0xD800
		nuint charsEqualToOrOverE000 = utf16Data + unchecked((nuint)0x7E40_7E40_7E40_7E40ul); // 0x8000 bit indicates >= 0xE000

		// A lane is a surrogate exactly when it is >= 0xD800 but not >= 0xE000,
		// i.e. when the two 0x8000 flags differ.
		if (((charsEqualToOrOverD800 ^ charsEqualToOrOverE000) & unchecked((nuint)0x8000_8000_8000_8000ul)) == 0)
		{
			// No surrogates found here. Use the popcnt mechanism from earlier to
			// track the number of 2-byte and 3-byte chars seen. We don't need
			// to worry about endianness issues here because the input words
			// were already in machine-endian order.

			nuint twoOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x3FFF_3FFF_3FFF_3FFFul); // 0x4000 bit indicates >= 0x0080
			nuint threeOrMoreUtf8BytesMask = utf16Data + unchecked((nuint)0x7FF0_7FF0_7FF0_7FF0ul); // 0x8000 bit indicates >= 0x0800

			// One set bit per char >= 0x0080 plus one more per char >= 0x0800.
			nuint combinedMask = (twoOrMoreUtf8BytesMask & unchecked((nuint)0x4000_4000_4000_4000ul))
				| (threeOrMoreUtf8BytesMask & unchecked((nuint)0x8000_8000_8000_8000ul));

			tempUtf8CodeUnitCountAdjustment += (uint)BitOperations.PopCount(combinedMask);
		}
		else
		{
			// Surrogates found. Strip off chars one-by-one.

			for (int i = CharsPerNuint - 1; i >= 0; i--)
			{
				uint thisChar; // remember: it was previously shifted right by 7, so [ 0000000x xxxxxxxx ]

				if (BitConverter.IsLittleEndian)
				{
					// Take the lowest lane, then shift the next one down.
					thisChar = (ushort)utf16Data;
					utf16Data >>= 16;
				}
				else
				{
					// Rotate the highest lane into the lowest position, then take it.
					utf16Data = BitOperations.RotateLeft(utf16Data, 16);
					thisChar = (ushort)utf16Data;
				}

				// thisChar is at most 0x01FF, so after adding the bias the result flag is
				// bit 9 (0x0200) and the sum never exceeds 0x03FE; shifting right by 9
				// therefore yields exactly 0 or 1.
				tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01FF) >> 9; // 1 if 0080..FFFF, else 0
				tempUtf8CodeUnitCountAdjustment += (thisChar + 0x01F0) >> 9; // 1 if 0800..FFFF, else 0

				if (!IsInRangeInclusive(thisChar, 0xD800u >> 7, 0xDFFF >> 7))
				{
					continue; // not a surrogate - keep going!
				}

				// TODO: surrogate handling is not implemented in this fragment. A surrogate
				// char currently gets the same adjustment as any other char >= 0x0800, and
				// nothing here checks that high and low surrogates are correctly paired.
			}
		}
	}