Skip to content

Instantly share code, notes, and snippets.

@lemire
Created March 19, 2024 15:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lemire/bb56fa88129b4c3e84c45271f51acce2 to your computer and use it in GitHub Desktop.
Save lemire/bb56fa88129b4c3e84c45271f51acce2 to your computer and use it in GitHub Desktop.
Core UTF-8 validation algorithm in C#
// Nibble-indexed lookup tables used by the validation loop below. Each table holds
// 16 one-byte entries made of error-class flags (TOO_SHORT, TOO_LONG, TWO_CONTS, ...).
// The flag constants are declared outside this excerpt. The loop ANDs the three
// lookups together, so a flag bit survives only when all three tables report it.

// shuf1: looked up with the HIGH nibble of the byte preceding each position.
Vector128<byte> shuf1 = Vector128.Create(
    // 0x0 .. 0x7 : preceding byte has its top bit clear (ASCII)
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 0x8 .. 0xB : preceding byte is 10xxxxxx
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 0xC : 1100xxxx
    TOO_SHORT | OVERLONG_2,
    // 0xD : 1101xxxx
    TOO_SHORT,
    // 0xE : 1110xxxx
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 0xF : 1111xxxx
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

// shuf2: looked up with the LOW nibble of the byte preceding each position.
Vector128<byte> shuf2 = Vector128.Create(
    CARRY | OVERLONG_2 | OVERLONG_3 | OVERLONG_4,     // 0x0
    CARRY | OVERLONG_2,                               // 0x1
    CARRY,                                            // 0x2
    CARRY,                                            // 0x3
    CARRY | TOO_LARGE,                                // 0x4
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0x5
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0x6
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0x7
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0x8
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0x9
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0xA
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0xB
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0xC
    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,   // 0xD
    CARRY | TOO_LARGE | TOO_LARGE_1000,               // 0xE
    CARRY | TOO_LARGE | TOO_LARGE_1000);              // 0xF

// shuf3: looked up with the HIGH nibble of the current byte.
Vector128<byte> shuf3 = Vector128.Create(
    // 0x0 .. 0x7 : current byte has its top bit clear (ASCII)
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // 0x8 : 1000xxxx
    TOO_LONG | TWO_CONTS | OVERLONG_2 | OVERLONG_3 | OVERLONG_4 | TOO_LARGE_1000,
    // 0x9 : 1001xxxx
    TOO_LONG | TWO_CONTS | OVERLONG_2 | OVERLONG_3 | TOO_LARGE,
    // 0xA : 1010xxxx
    TOO_LONG | TWO_CONTS | OVERLONG_2 | SURROGATE | TOO_LARGE,
    // 0xB : 1011xxxx
    TOO_LONG | TWO_CONTS | OVERLONG_2 | SURROGATE | TOO_LARGE,
    // 0xC .. 0xF : current byte is 11xxxxxx
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

// Thresholds for saturating subtraction: (b -sat threshold) has bit 0x80 set
// exactly when b >= 0xE0 (thirdByte) or b >= 0xF0 (fourthByte).
Vector128<byte> thirdByte = Vector128.Create((byte)(0b1110_0000u - 0x80));
Vector128<byte> fourthByte = Vector128.Create((byte)(0b1111_0000u - 0x80));

// Broadcast masks: low-nibble mask and top-bit mask.
Vector128<byte> v0f = Vector128.Create((byte)0x0F);
Vector128<byte> v80 = Vector128.Create((byte)0x80);
// Main SIMD loop: validates the input 16 bytes at a time.
// State carried between iterations (declared outside this excerpt):
//   prevInputBlock - the most recent non-ASCII block, used to look back up to 3 bytes;
//   prevIncomplete - nonzero when that block ended with an unfinished multi-byte sequence.
// On any detected problem the loop bails out to RewindAndValidateWithErrors
// (defined elsewhere; presumably it re-scans to locate the exact error - confirm).
// Performance note: we could process 64 bytes at a time for better speed in some cases.
for (; processedLength + 16 <= inputLength; processedLength += 16)
{
    Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
    // The block is pure ASCII exactly when its largest byte is <= 127.
    // (The comparison must be "<=": with "> 127" non-ASCII blocks would skip validation.)
    if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127)
    {
        // We have an ASCII block, no need to process it, but
        // we need to check if the previous block was incomplete.
        if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
        {
            return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
        }
        prevIncomplete = Vector128<byte>.Zero;
    }
    else
    {
        // Contains non-ASCII characters, we need to do non-trivial processing.

        // prev1[i] is the byte that precedes currentBlock[i] in the stream
        // (last byte of prevInputBlock followed by the first 15 of currentBlock; NEON vextq).
        Vector128<byte> prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));

        // Three table lookups: high nibble of prev1, low nibble of prev1, high nibble of current.
        // The 16-bit shift drags bits across byte boundaries, hence the mask with v0f.
        Vector128<byte> byte_1_high = Vector128.Shuffle(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
        Vector128<byte> byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f));
        Vector128<byte> byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);

        // A flag remains set only if all three lookups report it.
        Vector128<byte> sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);

        // Bytes two and three positions back, needed for the 3- and 4-byte sequence check.
        Vector128<byte> prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
        Vector128<byte> prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
        prevInputBlock = currentBlock;

        // Bit 0x80 is set where prev2 >= 0xE0 or prev3 >= 0xF0, i.e. where the current
        // byte has to be the 3rd byte of a 3-byte or the 3rd/4th byte of a 4-byte sequence.
        Vector128<byte> isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
        Vector128<byte> isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
        Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
        Vector128<byte> must23As80 = AdvSimd.And(must23, v80);

        // XOR cancels the 0x80 bit of sc where a continuation is required;
        // anything left over (in either direction) is an error.
        Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
        if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0)
        {
            return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
        }

        // Remember whether this block ends mid-sequence. maxValue is declared outside
        // this excerpt; presumably it holds per-lane limits so only lead bytes too close
        // to the end of the block leave a nonzero residue - confirm.
        prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment