Skip to content

Instantly share code, notes, and snippets.

@lemire
Created April 1, 2024 13:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lemire/7fc9d553a3ec34ac26ca6f5614beb6ac to your computer and use it in GitHub Desktop.
Save lemire/7fc9d553a3ec34ac26ca6f5614beb6ac to your computer and use it in GitHub Desktop.
utf8_validation.cs
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
{
int processedLength = 0;
if (pInputBuffer == null || inputLength <= 0)
{
return pInputBuffer;
}
if (inputLength > 128)
{
// We skip any ASCII characters at the start of the buffer
int asciirun = 0;
for(; asciirun + 64 <= inputLength; asciirun += 64)
{
Vector128<byte> block1 = Avx.LoadVector128(pInputBuffer + asciirun);
Vector128<byte> block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16);
Vector128<byte> block3 = Avx.LoadVector128(pInputBuffer + asciirun + 32);
Vector128<byte> block4 = Avx.LoadVector128(pInputBuffer + asciirun + 48);
Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
if (Sse2.MoveMask(or) != 0)
{
break;
}
}
processedLength = asciirun;
if (processedLength + 16 < inputLength)
{
// We still have work to do!
Vector128<byte> prevInputBlock = Vector128<byte>.Zero;
Vector128<byte> maxValue = Vector128.Create(
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);
Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
TOO_SHORT | OVERLONG_2,
TOO_SHORT,
TOO_SHORT | OVERLONG_3 | SURROGATE,
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
CARRY | OVERLONG_2,
CARRY,
CARRY,
CARRY | TOO_LARGE,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000);
Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
Vector128<byte> v0f = Vector128.Create((byte)0x0F);
Vector128<byte> v80 = Vector128.Create((byte)0x80);
for (; processedLength + 16 <= inputLength; processedLength += 16)
{
Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);
int mask = Sse2.MoveMask(currentBlock);
if (mask == 0)
{
// Console.WriteLine("ascii");
// We have an ASCII block, no need to process it, but
// we need to check if the previous block was incomplete.
if (Sse2.MoveMask(prevIncomplete) != 0)
{
// return pInputBuffer + processedLength;
// Console.WriteLine("not ascii");
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
}
prevIncomplete = Vector128<byte>.Zero;
}
else
{
// Contains non-ASCII characters, we need to do non-trivial processing
Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1));
Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f));
Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high);
Vector128<byte> prev2 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 2));
Vector128<byte> prev3 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 3));
prevInputBlock = currentBlock;
Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte);
Vector128<byte> must23As80 = Sse2.And(must23, v80);
Vector128<byte> error = Sse2.Xor(must23As80, sc);
if (Sse2.MoveMask(error) != 0)
{
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
}
prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
}
}
}
}
// We have processed all the blocks using SIMD, we need to process the remaining bytes.
// Process the remaining bytes with the scalar function
if (processedLength < inputLength)
{
// We need to possibly backtrack to the start of the last code point
// worst possible case is 4 bytes, where we need to backtrack 3 bytes
// 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
}
}
}
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
if (invalidBytePointer != pInputBuffer + inputLength)
{
// An invalid byte was found by the scalar function
return invalidBytePointer;
}
}
return pInputBuffer + inputLength;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment