Skip to content

Instantly share code, notes, and snippets.

@jridgewell
Last active December 5, 2017 00:18
Show Gist options
  • Save jridgewell/e93a0d4623adcff310b8d16c579d7bd2 to your computer and use it in GitHub Desktop.
Save jridgewell/e93a0d4623adcff310b8d16c579d7bd2 to your computer and use it in GitHub Desktop.
A reworked version of Bjoern Hoehrmann's DFA decoder, skipping the ternary.
// DFA state values. A state is stored as the offset of its row in the second
// part of utf8d (row index * 12 columns), which is why ACCEPT, the second row,
// is 12 and REJECT, the first row, is 0.
#define UTF8_ACCEPT 12
#define UTF8_REJECT 0
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// Reworked table, justification at https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
// Combined lookup table for the UTF-8 DFA: 256 byte-to-type entries followed
// by 9 rows of 12 state transitions (364 entries in total). decode() reads the
// second part as utf8d[256 + state + type], so every state value is the
// offset of its row: 0, 12, 24, ... 96.
static const uint8_t utf8d[] = {
// The first part of the table maps each input byte to a transition type
// (0-11). Type 9 marks bytes that lead to REJECT from every state.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF
11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF
// The second part of the table maps a (state, transition type) pair to the
// next state: one 12-entry row per state, one column per transition type.
// Every entry is itself a row offset, so it can be used as the next state
// without further scaling.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT (state 0): never left once entered
12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // ACCEPT (state 12)
0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 2-byte continue (state 24)
0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte continue (state 36)
0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte low/mid continue (state 48)
0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte continue (state 60)
0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low continue (state 72)
0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte high continue (state 84)
0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high continue (state 96)
};
// Advances the UTF-8 DFA by one input byte.
//
//   byte  - the next input byte.
//   state - in/out: current DFA state, overwritten with the next state. It is
//           used directly as a row offset into the second part of utf8d, so
//           it must hold one of that table's row offsets (for example
//           UTF8_ACCEPT or UTF8_REJECT); nothing here validates it.
//   codep - in/out: code point accumulator; receives the payload bits of byte.
//
// The accumulator is shifted and OR-ed unconditionally and is never cleared
// here, so the caller presumably has to zero *codep before the first byte of
// each new sequence.
static inline void
decode(const uint8_t byte, uint8_t* const state, uint32_t* const codep) {
  // Transition type of this byte, from the first 256 entries of the table.
  const uint8_t kind = utf8d[byte];
  // Half the type is the number of additional high bits to strip from the
  // byte on top of bit 7, leaving only the bits that carry code point data.
  const uint32_t payload = byte & (0x7F >> (kind >> 1));
  // Row is selected by the current state, column by the transition type.
  const unsigned next_index = 256u + *state + kind;

  *state = utf8d[next_index];
  *codep = (*codep << 6) | payload;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment