Last active
December 5, 2017 00:18
-
-
Save jridgewell/e93a0d4623adcff310b8d16c579d7bd2 to your computer and use it in GitHub Desktop.
A reworked version of Bjoern Hoehrmann's DFA decoder, skipping the ternary.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// DFA state values used with decode(). Each one is the offset of that state's
// 12-entry row inside the transition part of utf8d.
#define UTF8_ACCEPT 12
#define UTF8_REJECT 0
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// Reworked table, justification at https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
//
// Layout: 256 byte-to-type entries, followed by 9 state rows of 12 entries
// each (one column per transition type, 0-11). A state's value is the offset
// of its row within the second part, so the state reached from `state` on a
// byte of type `type` is utf8d[256 + state + type].
static const uint8_t utf8d[] = {
  // The first part of the table maps each byte to its transition type.
  // Type 9 (C0, C1, F5-FF) leads to REJECT from every state.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
  9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
  10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF
  11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF

  // The second part of the table maps a state to a new state when adding a
  // transition. Rows are listed in order of their offset (0, 12, 24, ...).
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0: REJECT
  12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // 12: ACCEPT
  0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 24: 2-byte continue
  0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 36: 3-byte continue
  0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 48: 3-byte low/mid continue
  0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 60: 4-byte continue
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 72: 4-byte low continue
  0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 84: 3-byte high continue
  0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 96: 4-byte mid/high continue
};
static inline void | |
decode(const uint8_t byte, uint8_t* const state, uint32_t* const codep) { | |
const uint8_t type = utf8d[byte]; | |
*state = utf8d[256 + *state + type]; | |
*codep = (*codep << 6) | (byte & (0x7F >> (type >> 1))); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment