Skip to content

Instantly share code, notes, and snippets.

@jkeiser
Created December 10, 2019 06:38
Show Gist options
  • Save jkeiser/a3dcadd2770fb422d114baa510bbc6c2 to your computer and use it in GitHub Desktop.
Save jkeiser/a3dcadd2770fb422d114baa510bbc6c2 to your computer and use it in GitHub Desktop.
UTF-8 lookup ARM
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>
//
// Detect Unicode errors.
//
// UTF-8 is designed to allow multiple bytes and be compatible with ASCII. It's a fairly basic
// encoding that uses the first few bits on each byte to denote a "byte type", and all other bits
// are straight up concatenated into the final value. The first byte of a multibyte character is a
// "leading byte" and starts with N 1's, where N is the total number of bytes (110_____ = 2 byte
// lead). The remaining bytes of a multibyte character all start with 10. 1-byte characters just
// start with 0, because that's what ASCII looks like. Here's what each size
//
// - ASCII (7 bits): 0_______
// - 2 byte character (11 bits): 110_____ 10______
// - 3 byte character (17 bits): 1110____ 10______ 10______
// - 4 byte character (23 bits): 11110___ 10______ 10______ 10______
// - 5+ byte character (illegal): 11111___ <illegal>
//
// There are 5 classes of error that can happen in Unicode:
//
// - TOO_SHORT: when you have a multibyte character with too few bytes (i.e. missing continuation).
// We detect this by looking for new characters (lead bytes) inside the range of a multibyte
// character.
//
// e.g. 11000000 01100001 (2-byte character where second byte is ASCII)
//
// - TOO_LONG: when there are more bytes in your character than you need (i.e. extra continuation).
// We detect this by requiring that the next byte after your multibyte character be a new
// character--so a continuation after your character is wrong.
//
// e.g. 11011111 10111111 10111111 (2-byte character followed by *another* continuation byte)
//
// - TOO_LARGE: Unicode only goes up to U+10FFFF. These characters are too large.
//
// e.g. 11110111 10111111 10111111 10111111 (bigger than 10FFFF).
//
// - OVERLONG: multibyte characters with a bunch of leading zeroes, where you could have
// used fewer bytes to make the same character. Like encoding an ASCII character in 4 bytes is
// technically possible, but UTF-8 disallows it so that there is only one way to write an "a".
//
// e.g. 11000001 10100001 (2-byte encoding of "a", which only requires 1 byte: 01100001)
//
// - SURROGATE: Unicode U+D800-U+DFFF is a *surrogate* character, reserved for use in UCS-2 and
// WTF-8 encodings for characters with > 2 bytes. These are illegal in pure UTF-8.
//
// e.g. 11101101 10100000 10000000 (U+D800)
//
// - INVALID_5_BYTE: 5-byte, 6-byte, 7-byte and 8-byte characters are unsupported; Unicode does not
// support values with more than 23 bits (which a 4-byte character supports).
//
// e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
//
// Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
//
// Code Points 1st 2s 3s 4s
// U+0000..U+007F 00..7F
// U+0080..U+07FF C2..DF 80..BF
// U+0800..U+0FFF E0 A0..BF 80..BF
// U+1000..U+CFFF E1..EC 80..BF 80..BF
// U+D000..U+D7FF ED 80..9F 80..BF
// U+E000..U+FFFF EE..EF 80..BF 80..BF
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
//
bool check_utf8(const char* buf, size_t len) {
uint8x16_t error;
uint8x16_t prev_input;
uint8x16_t prev_incomplete;
for (size_t i=0; i<len; i+=sizeof(uint8x16_t)) {
uint8x16_t input = vmovq_n_u8(&buf[i]);
//
// Check ASCII
//
if (vmaxvq_u8(input) < 0b10000000u) {
error = vorrq_u8(error, prev_incomplete);
continue;
}
//
// Match the "special case" exceptions like surrogates and overlong encodings using a trio
// of table lookups &'d together.
//
uint8x16_t prev1 = vextq_s8(prev_input, input, 16 - 1);
// These are the errors we're going to match for bytes 1-2, by looking at the first three
// nibbles of the character: <high bits of byte 1>> & <low bits of byte 1> & <high bits of byte 2>
static const int OVERLONG_2 = 0x01; // 1100000_ 10______ (technically we match 10______ but we could match ________, they both yield errors either way)
static const int OVERLONG_3 = 0x02; // 11100000 100_____ ________
static const int OVERLONG_4 = 0x04; // 11110000 1000____ ________ ________
static const int SURROGATE = 0x08; // 11101101 [101_]____
static const int TOO_LARGE = 0x10; // 11110100 (1001|101_)____
static const int TOO_LARGE_2 = 0x20; // 1111(1___|011_|0101) 10______
// After processing the rest of byte 1 (the low bits), we're still not done--we have to check
// byte 2 to be sure which things are errors and which aren't.
// Since high_bits is byte 5, byte 2 is high_bits.prev<3>
static const int CARRY = OVERLONG_2 | TOO_LARGE_2;
uint8x16_t byte_2_high = vqtbl1q_u8(
uint8x16_t{
// ASCII: ________ [0___]____
CARRY, CARRY, CARRY, CARRY,
// ASCII: ________ [0___]____
CARRY, CARRY, CARRY, CARRY,
// Continuations: ________ [10__]____
CARRY | OVERLONG_3 | OVERLONG_4, // ________ [1000]____
CARRY | OVERLONG_3 | TOO_LARGE, // ________ [1001]____
CARRY | TOO_LARGE | SURROGATE, // ________ [1010]____
CARRY | TOO_LARGE | SURROGATE, // ________ [1011]____
// Multibyte Leads: ________ [11__]____
CARRY, CARRY, CARRY, CARRY
},
vshrq_n_u8(input, 4)
);
uint8x16_t byte_1_high = vqtbl1q_u8(
uint8x16_t{
// [0___]____ (ASCII)
0, 0, 0, 0,
0, 0, 0, 0,
// [10__]____ (continuation)
0, 0, 0, 0,
// [11__]____ (2+-byte leads)
OVERLONG_2, 0, // [110_]____ (2-byte lead)
OVERLONG_3 | SURROGATE, // [1110]____ (3-byte lead)
OVERLONG_4 | TOO_LARGE | TOO_LARGE_2 // [1111]____ (4+-byte lead)
},
vshrq_n_u8(prev1, 4)
);
uint8x16_t byte_1_low = vqtlb1q_u8(
uint8x16_t{
// ____[00__] ________
OVERLONG_2 | OVERLONG_3 | OVERLONG_4, // ____[0000] ________
OVERLONG_2, // ____[0001] ________
0, 0,
// ____[01__] ________
TOO_LARGE, // ____[0100] ________
TOO_LARGE_2,
TOO_LARGE_2,
TOO_LARGE_2,
// ____[10__] ________
TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2, TOO_LARGE_2,
// ____[11__] ________
TOO_LARGE_2,
TOO_LARGE_2 | SURROGATE, // ____[1101] ________
TOO_LARGE_2, TOO_LARGE_2
},
vandq_u8(prev1, vdupq_n_u8(0x0F))
)
error = vorrq_u8(error, vandq_u8(byte_1_high, vandq_u8(byte_1_low, byte_2_high)));
//
// Validate that we have continuations iff we want them
//
uint8x16_t prev2 = vextq_s8(prev_input, input, 16 - 2);
uint8x16_t prev3 = vextq_s8(prev_input, input, 16 - 3);
// Cont is 10000000-101111111 (-65...-128). Use signed comparison.
uint8x16_t is_continuation = vcltq_s8(vreinterpretq_s8_u8(input), vdupq_n_s8(-64));
uint8x16_t is_second_byte = vcgeq_u8(prev1, vdupq_n_u8(0b11000000u));
uint8x16_t is_third_byte = vcgeq_u8(prev2, vdupq_n_u8(0b11100000u));
uint8x16_t is_fourth_byte = vcgeq_u8(prev3, vdupq_n_u8(0b11110000u));
// Use ^ instead of | for is_second_byte | is_third_byte | is_fourth_byte, because ^ is
// commutative. We can do this because we only have to report errors for cases with NO lead bytes,
// or exactly ONE, in which case ^ and | yield the same result.
error = vorrq_u8(error,
veorq_u8(
veorq_u8(is_continuation, is_second_byte),
veorq_u8(is_third_byte, is_fourth_byte)
)
);
//
// Prepare next iteration
//
// If there is an unfinished multibyte character, record that here (ASCII check will use it):
prev_incomplete = vcgt_u8(input, vuint8x16_t{
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
});
prev_input = input;
}
// If any error bits are on, this will be > 0
return vmaxvq_u8(error);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment