jridgewell/utf8-dfa-decoder.c

## utf8-dfa-decoder.c
// Decoder states are stored as row offsets into the transition part of the
// utf8d table (12 entries per row), not as small consecutive integers.
// UTF8_ACCEPT is the offset of the row labelled ACCEPT in that table.
#define UTF8_ACCEPT 12
// UTF8_REJECT is the offset of the row labelled REJECT; every entry of that
// row is 0, so once this state is reached the table never leaves it.
#define UTF8_REJECT 0

// Lookup table driving the UTF-8 decoding automaton used by decode().
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// Reworked table, justification at https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
//
// Layout:
//   [0, 256)   byte  -> character class (0..11)
//   [256, 364) state -> next state, 9 rows of 12 entries; a state value is the
//              offset of its row (0, 12, 24, ..., 96), so the lookup is
//              utf8d[256 + state + class].
//
// Character classes, as assigned by the first part of the table:
//    0: 00-7F              1: 80-8F             2: 90-9F
//    3: A0-BF              4: C2-DF             5: E1-EC, EE, EF
//    6: ED                 7: F1-F3             8: F4
//    9: C0, C1, F5-FF     10: E0               11: F0
static const uint8_t utf8d[] = {
      // The first part of the table maps each input byte to its character
      // class, which selects the column used in the second part.
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
      9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF (C0, C1 always reject)
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
     10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF (E0 and ED get their own classes)
     11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF (F5-FF always reject)

      // The second part of the table maps the current state (row) and the
      // class of the incoming byte (column 0..11) to the next state.
      0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,  0, // state  0: REJECT (every class stays in REJECT)
     12,  0,  0,  0, 24, 36, 48, 60, 72, 0, 84, 96, // state 12: ACCEPT (class 0 stays here; lead bytes pick a row)
      0, 12, 12, 12,  0,  0,  0,  0,  0, 0,  0,  0, // state 24: 2-byte continue (one byte left, 80-BF)
      0, 24, 24, 24,  0,  0,  0,  0,  0, 0,  0,  0, // state 36: 3-byte continue (two bytes left, next 80-BF)
      0, 24, 24,  0,  0,  0,  0,  0,  0, 0,  0,  0, // state 48: 3-byte low/mid continue (after ED, next 80-9F)
      0, 36, 36, 36,  0,  0,  0,  0,  0, 0,  0,  0, // state 60: 4-byte continue (three bytes left, next 80-BF)
      0, 36,  0,  0,  0,  0,  0,  0,  0, 0,  0,  0, // state 72: 4-byte low continue (after F4, next 80-8F)
      0,  0,  0, 24,  0,  0,  0,  0,  0, 0,  0,  0, // state 84: 3-byte high continue (after E0, next A0-BF)
      0,  0, 36, 36,  0,  0,  0,  0,  0, 0,  0,  0, // state 96: 4-byte mid/high continue (after F0, next 90-BF)
};

/*
 * Advance the UTF-8 automaton by one input byte.
 *
 * byte   the next byte of input.
 * state  in/out: current automaton state, a row offset into the transition
 *        part of utf8d (UTF8_ACCEPT, UTF8_REJECT or one of the intermediate
 *        rows). Overwritten with the state reached after consuming byte.
 * codep  in/out: code point accumulator. Its previous value is shifted left
 *        by six bits and the payload bits of byte are OR-ed in.
 *
 * The accumulator is never cleared here, not even for a single-byte
 * character, so the caller presumably has to zero *codep before the first
 * byte of every sequence -- confirm against the call sites.
 */
static inline void
decode(const uint8_t byte, uint8_t* const state, uint32_t* const codep) {
  // Character class of the byte (first 256 table entries).
  const uint8_t klass = utf8d[byte];

  // Halving the class gives the number of high bits to drop from 0x7F; the
  // class numbering is arranged so that this leaves exactly the data bits.
  const uint32_t payload = (uint32_t)(byte & (0x7F >> (klass >> 1)));

  // Transition lookup: rows start at index 256, and the state is itself the
  // row offset.
  const unsigned row = *state;
  *state = utf8d[256u + row + klass];

  *codep = (*codep << 6) | payload;
}
	// Decoder states are stored as row offsets into the transition part of the
	// utf8d table (12 entries per row), not as small consecutive integers.
	// UTF8_ACCEPT is the offset of the row labelled ACCEPT in that table.
	#define UTF8_ACCEPT 12
	// UTF8_REJECT is the offset of the row labelled REJECT; every entry of that
	// row is 0, so once this state is reached the table never leaves it.
	#define UTF8_REJECT 0

	// Lookup table driving the UTF-8 decoding automaton used by decode().
	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	// Reworked table, justification at https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
	//
	// Layout:
	//   [0, 256)   byte  -> character class (0..11)
	//   [256, 364) state -> next state, 9 rows of 12 entries; a state value is
	//              the offset of its row (0, 12, 24, ..., 96), so the lookup is
	//              utf8d[256 + state + class].
	//
	// Character classes, as assigned by the first part of the table:
	//    0: 00-7F              1: 80-8F             2: 90-9F
	//    3: A0-BF              4: C2-DF             5: E1-EC, EE, EF
	//    6: ED                 7: F1-F3             8: F4
	//    9: C0, C1, F5-FF     10: E0               11: F0
	static const uint8_t utf8d[] = {
	// The first part of the table maps each input byte to its character
	// class, which selects the column used in the second part.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
	9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF (C0, C1 always reject)
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
	10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF (E0 and ED get their own classes)
	11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF (F5-FF always reject)

	// The second part of the table maps the current state (row) and the
	// class of the incoming byte (column 0..11) to the next state.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // state  0: REJECT (every class stays in REJECT)
	12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // state 12: ACCEPT (class 0 stays here; lead bytes pick a row)
	0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // state 24: 2-byte continue (one byte left, 80-BF)
	0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // state 36: 3-byte continue (two bytes left, next 80-BF)
	0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // state 48: 3-byte low/mid continue (after ED, next 80-9F)
	0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // state 60: 4-byte continue (three bytes left, next 80-BF)
	0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // state 72: 4-byte low continue (after F4, next 80-8F)
	0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // state 84: 3-byte high continue (after E0, next A0-BF)
	0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // state 96: 4-byte mid/high continue (after F0, next 90-BF)
	};

	/*
	 * Advance the UTF-8 automaton by one input byte.
	 *
	 * byte   the next byte of input.
	 * state  in/out: current automaton state, a row offset into the transition
	 *        part of utf8d (UTF8_ACCEPT, UTF8_REJECT or one of the
	 *        intermediate rows). Overwritten with the state reached after
	 *        consuming byte.
	 * codep  in/out: code point accumulator. Its previous value is shifted
	 *        left by six bits and the payload bits of byte are OR-ed in.
	 *
	 * The accumulator is never cleared here, so the caller presumably has to
	 * zero *codep before the first byte of every sequence -- confirm against
	 * the call sites.
	 */
	static inline void
	decode(const uint8_t byte, uint8_t* const state, uint32_t* const codep) {
		// Character class of the byte (first 256 table entries).
		const uint8_t type = utf8d[byte];
		// Both parameters are pointers: read and write through them. Using the
		// pointer itself as an index, or assigning to it, does not compile.
		*state = utf8d[256 + *state + type];
		// Halving the class gives the number of high bits to drop from 0x7F,
		// leaving only the data bits of this byte.
		*codep = (*codep << 6) | (byte & (0x7F >> (type >> 1)));
	}