Created
August 18, 2017 21:46
-
-
Save jessestricker/3afeb272a40f7feabb17be7f7bbfa220 to your computer and use it in GitHub Desktop.
UTF-8 Decoding in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "utf8.hpp" | |
namespace {

/// Closed interval [low, high] over an ordered value type.
template<typename Value>
struct range {
    Value low;   ///< Smallest value contained in the interval.
    Value high;  ///< Largest value contained in the interval.

    /// Reports whether `value` lies inside the interval, both bounds included.
    constexpr bool is_in(Value value) const {
        // The upper bound is only consulted once the lower bound holds.
        return (low <= value) ? (value <= high) : false;
    }
};

/// Interval of single bytes, used for the UTF-8 byte classification.
using u8range = range<std::uint8_t>;

}
namespace utf8 { | |
constexpr u8range cb_simple{0x80, 0xBF}; | |
u8range get_b1_range(std::uint8_t b0) { | |
u8range rng = cb_simple; | |
if (b0 == 0xE0) rng.low = 0xA0; | |
else if (b0 == 0xED) rng.high = 0x9F; | |
else if (b0 == 0xF0) rng.low = 0x90; | |
else if (b0 == 0xF4) rng.high = 0x8F; | |
return rng; | |
} | |
} | |
char32_t utf8::decode(const std::uint8_t*& first, const std::uint8_t* last, utf8::decode_error& error) { | |
constexpr u8range b0_single{0x00, 0x7F}; | |
constexpr u8range b0_valid_multi{0xC2, 0xF4}; | |
// reset error | |
error = decode_error::none; | |
// read first byte | |
if (first == last) { | |
error = decode_error::end_of_data; | |
return 0; | |
} | |
const std::uint8_t b0 = *first; | |
// advance | |
++first; | |
if (b0_single.is_in(b0)) { // single byte code point | |
return b0; | |
} | |
// b0 ∈ [0x80, 0xFF] | |
if (!b0_valid_multi.is_in(b0)) { // invalid first byte | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
// b0 ∈ [0xC2, 0xF4] | |
// read second byte | |
if (first == last) { | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
const std::uint8_t b1 = *first; | |
// check second byte | |
const u8range b1_valid = get_b1_range(b0); | |
if (!b1_valid.is_in(b1)) { // invalid second byte | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
// advance | |
++first; | |
if (b0 <= 0xDF) { // two byte sequence | |
return static_cast<char32_t>(b0 & 0b0001'1111) << 6 | | |
static_cast<char32_t>(b1 & 0b0011'1111); | |
} | |
// b0 ∈ [0xE0, 0xF4] | |
// read third byte | |
if (first == last) { | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
const std::uint8_t b2 = *first; | |
// check third byte | |
if (!cb_simple.is_in(b2)) { // invalid third byte | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
// advance | |
++first; | |
if (b0 <= 0xEF) { // three byte sequence | |
return static_cast<char32_t>(b0 & 0b0000'1111) << 12 | | |
static_cast<char32_t>(b1 & 0b0011'1111) << 06 | | |
static_cast<char32_t>(b2 & 0b0011'1111); | |
} | |
// b0 ∈ [0xF0, 0xF4] | |
// read fourth byte | |
if (first == last) { | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
const std::uint8_t b3 = *first; | |
// check fourth byte | |
if (!cb_simple.is_in(b3)) { // invalid fourth byte | |
error = decode_error::invalid_sequence; | |
return replacement_char; | |
} | |
// advance | |
++first; | |
// four byte sequence | |
return static_cast<char32_t>(b0 & 0b0000'0111) << 18 | | |
static_cast<char32_t>(b1 & 0b0011'1111) << 12 | | |
static_cast<char32_t>(b2 & 0b0011'1111) << 06 | | |
static_cast<char32_t>(b3 & 0b0011'1111); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <cstddef> // std::size_t | |
#include <cstdint> // std::uint8_t | |
namespace utf8 {

/// Status reported by decode() through its `error` parameter.
enum class decode_error {
    none,              ///< A code point was decoded.
    invalid_sequence,  ///< The input bytes do not form a valid sequence.
    end_of_data        ///< There was no input left to decode.
};

/// Code point returned in place of an invalid sequence (U+FFFD).
constexpr char32_t replacement_char = 0xFFFD;

/// Largest code point value (U+10FFFF).
constexpr char32_t max_char = 0x10FFFF;

/// Maximum number of bytes in one encoded sequence.
constexpr std::size_t max_sequence_length = 4;

/// Decodes one code point from the byte range [first, last).
///
/// @param first  Current read position; moved forward past the bytes that
///               were consumed.
/// @param last   One past the final readable byte.
/// @param error  Receives the status of this call.
/// @return The decoded code point; see the definition for the values
///         returned when `error` is not `decode_error::none`.
char32_t decode(const std::uint8_t*& first, const std::uint8_t* last, decode_error& error);

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment