Skip to content

Instantly share code, notes, and snippets.

@jessestricker
Created August 18, 2017 21:46
Show Gist options
  • Save jessestricker/3afeb272a40f7feabb17be7f7bbfa220 to your computer and use it in GitHub Desktop.
Save jessestricker/3afeb272a40f7feabb17be7f7bbfa220 to your computer and use it in GitHub Desktop.
UTF-8 Decoding in C++
#include "utf8.hpp"
namespace {
/// Closed interval [low, high] over an ordered value type.
template<typename Value>
struct range {
    Value low;   ///< Smallest value that belongs to the interval.
    Value high;  ///< Largest value that belongs to the interval.

    /// Tells whether `value` lies inside the interval, both bounds included.
    constexpr bool is_in(Value value) const {
        if (!(low <= value)) {
            return false;  // below the lower bound
        }
        return value <= high;
    }
};

/// Interval of single byte values.
using u8range = range<std::uint8_t>;
}
namespace utf8 {
/// Range that an ordinary continuation byte (0x80..0xBF) must fall into.
constexpr u8range cb_simple{0x80, 0xBF};

/// Returns the range of values accepted for the second byte of a sequence
/// that starts with the lead byte `b0`.
///
/// Most lead bytes accept the plain continuation range `cb_simple`; four of
/// them narrow it on one side, matching the well-formed UTF-8 byte sequence
/// table of the Unicode Standard.
u8range get_b1_range(std::uint8_t b0) {
    u8range allowed = cb_simple;
    switch (b0) {
        case 0xE0:
            allowed.low = 0xA0;   // rules out overlong three-byte encodings
            break;
        case 0xED:
            allowed.high = 0x9F;  // rules out the surrogate range U+D800..U+DFFF
            break;
        case 0xF0:
            allowed.low = 0x90;   // rules out overlong four-byte encodings
            break;
        case 0xF4:
            allowed.high = 0x8F;  // rules out values above U+10FFFF
            break;
        default:
            break;                // plain continuation range applies
    }
    return allowed;
}
}
char32_t utf8::decode(const std::uint8_t*& first, const std::uint8_t* last, utf8::decode_error& error) {
constexpr u8range b0_single{0x00, 0x7F};
constexpr u8range b0_valid_multi{0xC2, 0xF4};
// reset error
error = decode_error::none;
// read first byte
if (first == last) {
error = decode_error::end_of_data;
return 0;
}
const std::uint8_t b0 = *first;
// advance
++first;
if (b0_single.is_in(b0)) { // single byte code point
return b0;
}
// b0 ∈ [0x80, 0xFF]
if (!b0_valid_multi.is_in(b0)) { // invalid first byte
error = decode_error::invalid_sequence;
return replacement_char;
}
// b0 ∈ [0xC2, 0xF4]
// read second byte
if (first == last) {
error = decode_error::invalid_sequence;
return replacement_char;
}
const std::uint8_t b1 = *first;
// check second byte
const u8range b1_valid = get_b1_range(b0);
if (!b1_valid.is_in(b1)) { // invalid second byte
error = decode_error::invalid_sequence;
return replacement_char;
}
// advance
++first;
if (b0 <= 0xDF) { // two byte sequence
return static_cast<char32_t>(b0 & 0b0001'1111) << 6 |
static_cast<char32_t>(b1 & 0b0011'1111);
}
// b0 ∈ [0xE0, 0xF4]
// read third byte
if (first == last) {
error = decode_error::invalid_sequence;
return replacement_char;
}
const std::uint8_t b2 = *first;
// check third byte
if (!cb_simple.is_in(b2)) { // invalid third byte
error = decode_error::invalid_sequence;
return replacement_char;
}
// advance
++first;
if (b0 <= 0xEF) { // three byte sequence
return static_cast<char32_t>(b0 & 0b0000'1111) << 12 |
static_cast<char32_t>(b1 & 0b0011'1111) << 06 |
static_cast<char32_t>(b2 & 0b0011'1111);
}
// b0 ∈ [0xF0, 0xF4]
// read fourth byte
if (first == last) {
error = decode_error::invalid_sequence;
return replacement_char;
}
const std::uint8_t b3 = *first;
// check fourth byte
if (!cb_simple.is_in(b3)) { // invalid fourth byte
error = decode_error::invalid_sequence;
return replacement_char;
}
// advance
++first;
// four byte sequence
return static_cast<char32_t>(b0 & 0b0000'0111) << 18 |
static_cast<char32_t>(b1 & 0b0011'1111) << 12 |
static_cast<char32_t>(b2 & 0b0011'1111) << 06 |
static_cast<char32_t>(b3 & 0b0011'1111);
}
#pragma once
#include <cstddef> // std::size_t
#include <cstdint> // std::uint8_t
namespace utf8 {

/// Outcome reported by decode() through its `error` parameter.
enum class decode_error {
    none,              ///< A code point was decoded successfully.
    invalid_sequence,  ///< The bytes do not form a valid UTF-8 sequence.
    end_of_data        ///< The input range was empty.
};

/// U+FFFD, returned by decode() when it reports `invalid_sequence`.
constexpr char32_t replacement_char = U'\uFFFD';

/// U+10FFFF, the largest Unicode code point.
constexpr char32_t max_char = U'\U0010FFFF';

/// Maximum number of bytes in one encoded code point.
constexpr std::size_t max_sequence_length = 4;

/// Decodes one code point from the UTF-8 bytes in [first, last).
///
/// @param first  In/out read cursor; advanced past the bytes that were consumed.
/// @param last   One-past-the-end of the input.
/// @param error  Receives the outcome of the call (see decode_error).
/// @return The decoded code point, 0 on `end_of_data`, or `replacement_char`
///         on `invalid_sequence`.
char32_t decode(const std::uint8_t*& first, const std::uint8_t* last, decode_error& error);

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment