lemire/sse.cs

## sse.cs
  public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
        {

            int processedLength = 0;

            if (pInputBuffer == null || inputLength <= 0)
            {
                return pInputBuffer;
            }
            if (inputLength > 128)
            {
                // We skip any ASCII characters at the start of the buffer
                int asciirun = 0;
                for(; asciirun + 64 <= inputLength; asciirun += 64)
                {
                    Vector128<byte> block1 = Avx.LoadVector128(pInputBuffer + asciirun);
                    Vector128<byte> block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16);
                    Vector128<byte> block3 = Avx.LoadVector128(pInputBuffer + asciirun + 32);
                    Vector128<byte> block4 = Avx.LoadVector128(pInputBuffer + asciirun + 48);

                    Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
                    if (Sse2.MoveMask(or) != 0)
                    {
                        break;
                    }
                }
                processedLength = asciirun;

                if (processedLength + 16 < inputLength)
                {
                    // We still have work to do!
                    Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

                    Vector128<byte> maxValue = Vector128.Create(
                            255, 255, 255, 255, 255, 255, 255, 255,
                            255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
                    Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);


                    Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                            TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                            TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                            TOO_SHORT | OVERLONG_2,
                            TOO_SHORT,
                            TOO_SHORT | OVERLONG_3 | SURROGATE,
                            TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

                    Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                            CARRY | OVERLONG_2,
                            CARRY,
                            CARRY,
                            CARRY | TOO_LARGE,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                            CARRY | TOO_LARGE | TOO_LARGE_1000,
                            CARRY | TOO_LARGE | TOO_LARGE_1000);
                    Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                            TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                            TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                            TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                            TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                            TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                            TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

                    Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
                    Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
                    Vector128<byte> v0f = Vector128.Create((byte)0x0F);
                    Vector128<byte> v80 = Vector128.Create((byte)0x80);

                    for (; processedLength + 16 <= inputLength; processedLength += 16)
                    {

                        Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);

                        int mask = Sse2.MoveMask(currentBlock);
                        if (mask == 0)
                        {
                                // Console.WriteLine("ascii");

                            // We have an ASCII block, no need to process it, but
                            // we need to check if the previous block was incomplete.
                            if (Sse2.MoveMask(prevIncomplete) != 0)
                            {
                               // return pInputBuffer + processedLength;

                              //  Console.WriteLine("not ascii");
                               return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                            }
                            prevIncomplete = Vector128<byte>.Zero;
                        }
                        else
                        {
                            // Contains non-ASCII characters, we need to do non-trivial processing
                            Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1));
                            Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
                            Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f));
                            Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
                            Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high);
                            Vector128<byte> prev2 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 2));
                            Vector128<byte> prev3 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 3));
                            prevInputBlock = currentBlock;
                            Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
                            Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
                            Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte);
                            Vector128<byte> must23As80 = Sse2.And(must23, v80);
                            Vector128<byte> error = Sse2.Xor(must23As80, sc);
                            if (Sse2.MoveMask(error) != 0)
                            {
                                return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                            }
                            prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
                        }
                    }
                }
            }
            // We have processed all the blocks using SIMD, we need to process the remaining bytes.

            // Process the remaining bytes with the scalar function
            if (processedLength < inputLength)
            {
                // We need to possibly backtrack to the start of the last code point
                // worst possible case is 4 bytes, where we need to backtrack 3 bytes
                // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
                if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                {
                    processedLength -= 1;
                    if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                    {
                        processedLength -= 1;
                        if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                        {
                            processedLength -= 1;
                        }
                    }
                }
                byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
                if (invalidBytePointer != pInputBuffer + inputLength)
                {
                    // An invalid byte was found by the scalar function
                    return invalidBytePointer;
                }
            }

            return pInputBuffer + inputLength;
        }
	public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
	{

	int processedLength = 0;

	if (pInputBuffer == null \|\| inputLength <= 0)
	{
	return pInputBuffer;
	}
	if (inputLength > 128)
	{
	// We skip any ASCII characters at the start of the buffer
	int asciirun = 0;
	for(; asciirun + 64 <= inputLength; asciirun += 64)
	{
	Vector128<byte> block1 = Avx.LoadVector128(pInputBuffer + asciirun);
	Vector128<byte> block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16);
	Vector128<byte> block3 = Avx.LoadVector128(pInputBuffer + asciirun + 32);
	Vector128<byte> block4 = Avx.LoadVector128(pInputBuffer + asciirun + 48);

	Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
	if (Sse2.MoveMask(or) != 0)
	{
	break;
	}
	}
	processedLength = asciirun;

	if (processedLength + 16 < inputLength)
	{
	// We still have work to do!
	Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

	Vector128<byte> maxValue = Vector128.Create(
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
	Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);


	Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
	TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
	TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
	TOO_SHORT \| OVERLONG_2,
	TOO_SHORT,
	TOO_SHORT \| OVERLONG_3 \| SURROGATE,
	TOO_SHORT \| TOO_LARGE \| TOO_LARGE_1000 \| OVERLONG_4);

	Vector128<byte> shuf2 = Vector128.Create(CARRY \| OVERLONG_3 \| OVERLONG_2 \| OVERLONG_4,
	CARRY \| OVERLONG_2,
	CARRY,
	CARRY,
	CARRY \| TOO_LARGE,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000 \| SURROGATE,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000,
	CARRY \| TOO_LARGE \| TOO_LARGE_1000);
	Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
	TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
	TOO_LONG \| OVERLONG_2 \| TWO_CONTS \| OVERLONG_3 \| TOO_LARGE_1000 \| OVERLONG_4,
	TOO_LONG \| OVERLONG_2 \| TWO_CONTS \| OVERLONG_3 \| TOO_LARGE,
	TOO_LONG \| OVERLONG_2 \| TWO_CONTS \| SURROGATE \| TOO_LARGE,
	TOO_LONG \| OVERLONG_2 \| TWO_CONTS \| SURROGATE \| TOO_LARGE,
	TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

	Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
	Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
	Vector128<byte> v0f = Vector128.Create((byte)0x0F);
	Vector128<byte> v80 = Vector128.Create((byte)0x80);

	for (; processedLength + 16 <= inputLength; processedLength += 16)
	{

	Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);

	int mask = Sse2.MoveMask(currentBlock);
	if (mask == 0)
	{
	// Console.WriteLine("ascii");

	// We have an ASCII block, no need to process it, but
	// we need to check if the previous block was incomplete.
	if (Sse2.MoveMask(prevIncomplete) != 0)
	{
	// return pInputBuffer + processedLength;

	// Console.WriteLine("not ascii");
	return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
	}
	prevIncomplete = Vector128<byte>.Zero;
	}
	else
	{
	// Contains non-ASCII characters, we need to do non-trivial processing
	Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1));
	Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
	Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f));
	Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
	Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high);
	Vector128<byte> prev2 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 2));
	Vector128<byte> prev3 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 3));
	prevInputBlock = currentBlock;
	Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
	Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
	Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte);
	Vector128<byte> must23As80 = Sse2.And(must23, v80);
	Vector128<byte> error = Sse2.Xor(must23As80, sc);
	if (Sse2.MoveMask(error) != 0)
	{
	return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
	}
	prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
	}
	}
	}
	}
	// We have processed all the blocks using SIMD, we need to process the remaining bytes.

	// Process the remaining bytes with the scalar function
	if (processedLength < inputLength)
	{
	// We need to possibly backtrack to the start of the last code point
	// worst possible case is 4 bytes, where we need to backtrack 3 bytes
	// 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
	if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
	{
	processedLength -= 1;
	if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
	{
	processedLength -= 1;
	if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
	{
	processedLength -= 1;
	}
	}
	}
	byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
	if (invalidBytePointer != pInputBuffer + inputLength)
	{
	// An invalid byte was found by the scalar function
	return invalidBytePointer;
	}
	}

	return pInputBuffer + inputLength;
	}