jkeiser/utf8lookuparm.cpp

## utf8lookuparm.cpp
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>

//
// Detect Unicode errors.
//
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
// are straight up concatenated into the final value. The first byte of a multibyte character is a
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
// start with 0, because that's what ASCII looks like. Here's what each size
//
// - ASCII (7 bits):              0_______
// - 2 byte character (11 bits):  110_____ 10______
// - 3 byte character (17 bits):  1110____ 10______ 10______
// - 4 byte character (23 bits):  11110___ 10______ 10______ 10______
// - 5+ byte character (illegal): 11111___ <illegal>
//
// There are 5 classes of error that can happen in Unicode:
//
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
//   We detect this by looking for new characters (lead bytes) inside the range of a multibyte
//   character.
//
//   e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
//
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
//   We detect this by requiring that the next byte after your multibyte character be a new
//   character--so a continuation after your character is wrong.
//
//   e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
//
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
//
//   e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
//
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
//   used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
//   technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
//
//   e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
//
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
//   WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
//
//   e.g. 11101101 10100000 10000000 (U+D800)
//
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
//   support values with more than 23 bits (which a 4-byte character supports).
//
//   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
//
// Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
//
//   Code Points        1st       2s       3s       4s
//  U+0000..U+007F     00..7F
//  U+0080..U+07FF     C2..DF   80..BF
//  U+0800..U+0FFF     E0       A0..BF   80..BF
//  U+1000..U+CFFF     E1..EC   80..BF   80..BF
//  U+D000..U+D7FF     ED       80..9F   80..BF
//  U+E000..U+FFFF     EE..EF   80..BF   80..BF
//  U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
//  U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
//  U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
//

bool check_utf8(const char* buf, size_t len) {
  uint8x16_t error;
  uint8x16_t prev_input;
  uint8x16_t prev_incomplete;
  for (size_t i=0; i<len; i+=sizeof(uint8x16_t)) {
    uint8x16_t input = vmovq_n_u8(&buf[i]);

    //
    // Check ASCII
    //
    if (vmaxvq_u8(input) < 0b10000000u) {
        error = vorrq_u8(error, prev_incomplete);
        continue;
    }

    //
    // Match the "special case" exceptions like surrogates and overlong encodings using a trio
    // of table lookups &'d together.
    //
    uint8x16_t prev1 = vextq_s8(prev_input, input, 16 - 1);

    // These are the errors we're going to match for bytes 1-2, by looking at the first three
    // nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
    static const int OVERLONG_2  = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
    static const int OVERLONG_3  = 0x02; // 11100000 100_____ ________
    static const int OVERLONG_4  = 0x04; // 11110000 1000____ ________ ________
    static const int SURROGATE   = 0x08; // 11101101 [101_]____
    static const int TOO_LARGE   = 0x10; // 11110100 (1001|101_)____
    static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______

    // After processing the rest of byte 1 (the low bits), we're still not done--we have to check
    // byte 2 to be sure which things are errors and which aren't.
    // Since high_bits is byte 5, byte 2 is high_bits.prev<3>
    static const int CARRY = OVERLONG_2 | TOO_LARGE_2;
    uint8x16_t byte_2_high = vqtbl1q_u8(
        uint8x16_t{
            // ASCII: ________ [0___]____
            CARRY, CARRY, CARRY, CARRY,
            // ASCII: ________ [0___]____
            CARRY, CARRY, CARRY, CARRY,
            // Continuations: ________ [10__]____
            CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
            CARRY | OVERLONG_3 | TOO_LARGE,  // ________ [1001]____
            CARRY | TOO_LARGE  | SURROGATE,  // ________ [1010]____
            CARRY | TOO_LARGE  | SURROGATE,  // ________ [1011]____
            // Multibyte Leads: ________ [11__]____
            CARRY, CARRY, CARRY, CARRY
        },
        vshrq_n_u8(input, 4)
    );

    uint8x16_t byte_1_high = vqtbl1q_u8(
        uint8x16_t{
            // [0___]____ (ASCII)
            0, 0, 0, 0,
            0, 0, 0, 0,
            // [10__]____ (continuation)
            0, 0, 0, 0,
            // [11__]____ (2+-byte leads)
            OVERLONG_2, 0,                       // [110_]____ (2-byte lead)
            OVERLONG_3 | SURROGATE,              // [1110]____ (3-byte lead)
            OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead)
        },
        vshrq_n_u8(prev1, 4)
    );

    uint8x16_t byte_1_low = vqtlb1q_u8(
        uint8x16_t{
            // ____[00__] ________
            OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
            OVERLONG_2,                           // ____[0001] ________
            0, 0,
            // ____[01__] ________
            TOO_LARGE,                            // ____[0100] ________
            TOO_LARGE_2,
            TOO_LARGE_2,
            TOO_LARGE_2,
            // ____[10__] ________
            TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
            // ____[11__] ________
            TOO_LARGE_2,
            TOO_LARGE_2 | SURROGATE,                            // ____[1101] ________
            TOO_LARGE_2, TOO_LARGE_2
        },
        vandq_u8(prev1, vdupq_n_u8(0x0F))
    )

    error = vorrq_u8(error, vandq_u8(byte_1_high, vandq_u8(byte_1_low, byte_2_high)));

    //
    // Validate that we have continuations iff we want them
    //
    uint8x16_t prev2 = vextq_s8(prev_input, input, 16 - 2);
    uint8x16_t prev3 = vextq_s8(prev_input, input, 16 - 3);

    // Cont is 10000000-101111111 (-65...-128). Use signed comparison.
    uint8x16_t is_continuation = vcltq_s8(vreinterpretq_s8_u8(input), vdupq_n_s8(-64));
    uint8x16_t is_second_byte  = vcgeq_u8(prev1, vdupq_n_u8(0b11000000u));
    uint8x16_t is_third_byte   = vcgeq_u8(prev2, vdupq_n_u8(0b11100000u));
    uint8x16_t is_fourth_byte  = vcgeq_u8(prev3, vdupq_n_u8(0b11110000u));
    // Use ^ instead of | for is_second_byte | is_third_byte | is_fourth_byte, because ^ is
    // commutative. We can do this because we only have to report errors for cases with NO lead bytes,
    // or exactly ONE, in which case ^ and | yield the same result.
    error = vorrq_u8(error,
        veorq_u8(
            veorq_u8(is_continuation, is_second_byte),
            veorq_u8(is_third_byte, is_fourth_byte)
        )
    );

    //
    // Prepare next iteration
    //

    // If there is an unfinished multibyte character, record that here (ASCII check will use it):
    prev_incomplete = vcgt_u8(input, vuint8x16_t{
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
    });
    prev_input = input;
  }

  // If any error bits are on, this will be > 0
  return vmaxvq_u8(error);
}
	#include <stdio.h>
	#include <stdint.h>
	#include <arm_neon.h>

	//
	// Detect Unicode errors.
	//
	// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
	// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
	// are straight up concatenated into the final value. The first byte of a multibyte character is a
	// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
	// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
	// start with 0, because that's what ASCII looks like. Here's what each size
	//
	// - ASCII (7 bits): 0_______
	// - 2 byte character (11 bits): 110_____ 10______
	// - 3 byte character (17 bits): 1110____ 10______ 10______
	// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______
	// - 5+ byte character (illegal): 11111___ <illegal>
	//
	// There are 5 classes of error that can happen in Unicode:
	//
	// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
	// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
	// character.
	//
	// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
	//
	// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
	// We detect this by requiring that the next byte after your multibyte character be a new
	// character--so a continuation after your character is wrong.
	//
	// e.g. 11011111 10111111 10111111 (2-byte character followed by another continuation byte)
	//
	// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
	//
	// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
	//
	// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
	// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
	// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
	//
	// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
	//
	// - SURROGATE: Unicode U+D800-U+DFFF is a surrogate character, reserved for use in UCS-2 and
	// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
	//
	// e.g. 11101101 10100000 10000000 (U+D800)
	//
	// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
	// support values with more than 23 bits (which a 4-byte character supports).
	//
	// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
	//
	// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
	//
	// Code Points 1st 2s 3s 4s
	// U+0000..U+007F 00..7F
	// U+0080..U+07FF C2..DF 80..BF
	// U+0800..U+0FFF E0 A0..BF 80..BF
	// U+1000..U+CFFF E1..EC 80..BF 80..BF
	// U+D000..U+D7FF ED 80..9F 80..BF
	// U+E000..U+FFFF EE..EF 80..BF 80..BF
	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
	//

	bool check_utf8(const char* buf, size_t len) {
	uint8x16_t error;
	uint8x16_t prev_input;
	uint8x16_t prev_incomplete;
	for (size_t i=0; i<len; i+=sizeof(uint8x16_t)) {
	uint8x16_t input = vmovq_n_u8(&buf[i]);

	//
	// Check ASCII
	//
	if (vmaxvq_u8(input) < 0b10000000u) {
	error = vorrq_u8(error, prev_incomplete);
	continue;
	}

	//
	// Match the "special case" exceptions like surrogates and overlong encodings using a trio
	// of table lookups &'d together.
	//
	uint8x16_t prev1 = vextq_s8(prev_input, input, 16 - 1);

	// These are the errors we're going to match for bytes 1-2, by looking at the first three
	// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
	static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
	static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________
	static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________
	static const int SURROGATE = 0x08; // 11101101 [101_]____
	static const int TOO_LARGE = 0x10; // 11110100 (1001\|101_)____
	static const int TOO_LARGE_2 = 0x20; // 1111(1___\|011_\|0101) 10______

	// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
	// byte 2 to be sure which things are errors and which aren't.
	// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
	static const int CARRY = OVERLONG_2 \| TOO_LARGE_2;
	uint8x16_t byte_2_high = vqtbl1q_u8(
	uint8x16_t{
	// ASCII: ________ [0___]____
	CARRY, CARRY, CARRY, CARRY,
	// ASCII: ________ [0___]____
	CARRY, CARRY, CARRY, CARRY,
	// Continuations: ________ [10__]____
	CARRY \| OVERLONG_3 \| OVERLONG_4, // ________ [1000]____
	CARRY \| OVERLONG_3 \| TOO_LARGE, // ________ [1001]____
	CARRY \| TOO_LARGE \| SURROGATE, // ________ [1010]____
	CARRY \| TOO_LARGE \| SURROGATE, // ________ [1011]____
	// Multibyte Leads: ________ [11__]____
	CARRY, CARRY, CARRY, CARRY
	},
	vshrq_n_u8(input, 4)
	);

	uint8x16_t byte_1_high = vqtbl1q_u8(
	uint8x16_t{
	// [0___]____ (ASCII)
	0, 0, 0, 0,
	0, 0, 0, 0,
	// [10__]____ (continuation)
	0, 0, 0, 0,
	// [11__]____ (2+-byte leads)
	OVERLONG_2, 0, // [110_]____ (2-byte lead)
	OVERLONG_3 \| SURROGATE, // [1110]____ (3-byte lead)
	OVERLONG_4 \| TOO_LARGE \| TOO_LARGE_2 // [1111]____ (4+-byte lead)
	},
	vshrq_n_u8(prev1, 4)
	);

	uint8x16_t byte_1_low = vqtlb1q_u8(
	uint8x16_t{
	// ____[00__] ________
	OVERLONG_2 \| OVERLONG_3 \| OVERLONG_4, // ____[0000] ________
	OVERLONG_2, // ____[0001] ________
	0, 0,
	// ____[01__] ________
	TOO_LARGE, // ____[0100] ________
	TOO_LARGE_2,
	TOO_LARGE_2,
	TOO_LARGE_2,
	// ____[10__] ________
	TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
	// ____[11__] ________
	TOO_LARGE_2,
	TOO_LARGE_2 \| SURROGATE, // ____[1101] ________
	TOO_LARGE_2, TOO_LARGE_2
	},
	vandq_u8(prev1, vdupq_n_u8(0x0F))
	)

	error = vorrq_u8(error, vandq_u8(byte_1_high, vandq_u8(byte_1_low, byte_2_high)));

	//
	// Validate that we have continuations iff we want them
	//
	uint8x16_t prev2 = vextq_s8(prev_input, input, 16 - 2);
	uint8x16_t prev3 = vextq_s8(prev_input, input, 16 - 3);

	// Cont is 10000000-101111111 (-65...-128). Use signed comparison.
	uint8x16_t is_continuation = vcltq_s8(vreinterpretq_s8_u8(input), vdupq_n_s8(-64));
	uint8x16_t is_second_byte = vcgeq_u8(prev1, vdupq_n_u8(0b11000000u));
	uint8x16_t is_third_byte = vcgeq_u8(prev2, vdupq_n_u8(0b11100000u));
	uint8x16_t is_fourth_byte = vcgeq_u8(prev3, vdupq_n_u8(0b11110000u));
	// Use ^ instead of \| for is_second_byte \| is_third_byte \| is_fourth_byte, because ^ is
	// commutative. We can do this because we only have to report errors for cases with NO lead bytes,
	// or exactly ONE, in which case ^ and \| yield the same result.
	error = vorrq_u8(error,
	veorq_u8(
	veorq_u8(is_continuation, is_second_byte),
	veorq_u8(is_third_byte, is_fourth_byte)
	)
	);

	//
	// Prepare next iteration
	//

	// If there is an unfinished multibyte character, record that here (ASCII check will use it):
	prev_incomplete = vcgt_u8(input, vuint8x16_t{
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
	});
	prev_input = input;
	}

	// If any error bits are on, this will be > 0
	return vmaxvq_u8(error);
	}