Created
April 1, 2024 13:31
-
-
Save lemire/7fc9d553a3ec34ac26ca6f5614beb6ac to your computer and use it in GitHub Desktop.
utf8_validation.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength) | |
{ | |
int processedLength = 0; | |
if (pInputBuffer == null || inputLength <= 0) | |
{ | |
return pInputBuffer; | |
} | |
if (inputLength > 128) | |
{ | |
// We skip any ASCII characters at the start of the buffer | |
int asciirun = 0; | |
for(; asciirun + 64 <= inputLength; asciirun += 64) | |
{ | |
Vector128<byte> block1 = Avx.LoadVector128(pInputBuffer + asciirun); | |
Vector128<byte> block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16); | |
Vector128<byte> block3 = Avx.LoadVector128(pInputBuffer + asciirun + 32); | |
Vector128<byte> block4 = Avx.LoadVector128(pInputBuffer + asciirun + 48); | |
Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4)); | |
if (Sse2.MoveMask(or) != 0) | |
{ | |
break; | |
} | |
} | |
processedLength = asciirun; | |
if (processedLength + 16 < inputLength) | |
{ | |
// We still have work to do! | |
Vector128<byte> prevInputBlock = Vector128<byte>.Zero; | |
Vector128<byte> maxValue = Vector128.Create( | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); | |
Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue); | |
Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, | |
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, | |
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, | |
TOO_SHORT | OVERLONG_2, | |
TOO_SHORT, | |
TOO_SHORT | OVERLONG_3 | SURROGATE, | |
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); | |
Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, | |
CARRY | OVERLONG_2, | |
CARRY, | |
CARRY, | |
CARRY | TOO_LARGE, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, | |
CARRY | TOO_LARGE | TOO_LARGE_1000, | |
CARRY | TOO_LARGE | TOO_LARGE_1000); | |
Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, | |
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, | |
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, | |
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, | |
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, | |
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, | |
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); | |
Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80)); | |
Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); | |
Vector128<byte> v0f = Vector128.Create((byte)0x0F); | |
Vector128<byte> v80 = Vector128.Create((byte)0x80); | |
for (; processedLength + 16 <= inputLength; processedLength += 16) | |
{ | |
Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength); | |
int mask = Sse2.MoveMask(currentBlock); | |
if (mask == 0) | |
{ | |
// Console.WriteLine("ascii"); | |
// We have an ASCII block, no need to process it, but | |
// we need to check if the previous block was incomplete. | |
if (Sse2.MoveMask(prevIncomplete) != 0) | |
{ | |
// return pInputBuffer + processedLength; | |
// Console.WriteLine("not ascii"); | |
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); | |
} | |
prevIncomplete = Vector128<byte>.Zero; | |
} | |
else | |
{ | |
// Contains non-ASCII characters, we need to do non-trivial processing | |
Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1)); | |
Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); | |
Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f)); | |
Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); | |
Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high); | |
Vector128<byte> prev2 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 2)); | |
Vector128<byte> prev3 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 3)); | |
prevInputBlock = currentBlock; | |
Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte); | |
Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte); | |
Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte); | |
Vector128<byte> must23As80 = Sse2.And(must23, v80); | |
Vector128<byte> error = Sse2.Xor(must23As80, sc); | |
if (Sse2.MoveMask(error) != 0) | |
{ | |
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); | |
} | |
prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue); | |
} | |
} | |
} | |
} | |
// We have processed all the blocks using SIMD, we need to process the remaining bytes. | |
// Process the remaining bytes with the scalar function | |
if (processedLength < inputLength) | |
{ | |
// We need to possibly backtrack to the start of the last code point | |
// worst possible case is 4 bytes, where we need to backtrack 3 bytes | |
// 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte | |
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) | |
{ | |
processedLength -= 1; | |
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) | |
{ | |
processedLength -= 1; | |
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) | |
{ | |
processedLength -= 1; | |
} | |
} | |
} | |
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength); | |
if (invalidBytePointer != pInputBuffer + inputLength) | |
{ | |
// An invalid byte was found by the scalar function | |
return invalidBytePointer; | |
} | |
} | |
return pInputBuffer + inputLength; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment