Skip to content

Instantly share code, notes, and snippets.

@vtnerd
Created November 21, 2022 15:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vtnerd/e1f065eb3fc1e523b5bc32c573593663 to your computer and use it in GitHub Desktop.
Save vtnerd/e1f065eb3fc1e523b5bc32c573593663 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <boost/range/iterator_range.hpp>
#include <boost/spirit/home/support/char_encoding/unicode.hpp>
namespace {

/// Bit patterns used to recognise one multi-byte UTF-8 lead byte form.
struct multi_byte_info {
  std::uint8_t id_mask;    ///< lead-byte bits that identify the form
  std::uint8_t id_matcher; ///< required value of the bits under `id_mask`
  std::uint8_t data_mask;  ///< lead-byte bits that carry code point data
};

// Continuation bytes look like 10xxxxxx: the top two bits tag the byte and
// the low six bits carry code point data.
constexpr std::uint8_t multi_byte_id_mask{0xC0};
constexpr std::uint8_t multi_byte_id_matcher{0x80};
constexpr std::uint8_t multi_byte_data_mask{0x3F};
constexpr std::uint8_t multi_byte_bits{6};

// Lead-byte descriptions ordered by number of continuation bytes (1, 2, 3).
// Single-byte (ASCII) values have no entry here.
constexpr multi_byte_info multi_byte_infos[] = {
    {0xE0, 0xC0, 0x1F}, // 110xxxxx
    {0xF0, 0xE0, 0x0F}, // 1110xxxx
    {0xF8, 0xF0, 0x07}, // 11110xxx
};

// Largest number of continuation bytes a lead byte may announce.
constexpr unsigned max_length =
    sizeof(multi_byte_infos) / sizeof(multi_byte_infos[0]);

// overlong[n - 1] is the smallest code point allowed for a sequence with `n`
// continuation bytes; anything lower is an overlong encoding.
constexpr std::uint32_t overlong[] = {0x80, 0x800, 0x10000};

// Highest code point accepted by the decoder.
constexpr std::uint32_t max_code_point{0x10FFFF};

} // namespace
/// Outcome of trying to decode one code point.
enum class extraction : std::uint8_t {
  success,
  failure
};

/// Result record produced by a single decode attempt.
struct extraction_attempt {
  /// Decoded value; the decoder's tests only compare it on success.
  std::uint32_t code_point;
  /// Number of input bytes the attempt consumed.
  std::uint8_t bytes_processed;
  /// Whether the attempt produced a valid code point.
  extraction status;
};
/// Decodes the UTF-8 sequence that starts at `position`.
///
/// @param position Random-access iterator to the first byte to decode.
/// @param end      Iterator one past the last readable byte.
/// @return An `extraction_attempt` whose `status` is `extraction::success`
///         only when a complete, well-formed sequence was read.
///         `bytes_processed` counts the lead byte plus every continuation
///         byte that was validated (0 for an empty range), so a caller can
///         always skip exactly that many bytes. `code_point` is only
///         meaningful on success.
///
/// Rejected input: stray continuation bytes, unknown lead bytes, truncated
/// sequences, overlong encodings, values above `max_code_point`, and UTF-16
/// surrogate code points (U+D800..U+DFFF), which UTF-8 must not encode.
template <typename Iterator>
constexpr extraction_attempt next_code_point(Iterator position,
                                             const Iterator &end) {
  static_assert(
      std::is_same<typename std::iterator_traits<Iterator>::iterator_category,
                   std::random_access_iterator_tag>{},
      "bad iterator type");
  using difference_type =
      typename std::iterator_traits<Iterator>::difference_type;

  // Kept local so this function does not rely on any additional file-level
  // constant.
  constexpr std::uint32_t first_surrogate = 0xD800;
  constexpr std::uint32_t last_surrogate = 0xDFFF;

  extraction_attempt result{0, 0, extraction::failure};
  if (position == end) {
    return result; // nothing to decode
  }

  result.code_point = std::uint8_t(*position);
  ++position;
  ++result.bytes_processed;

  if (0x7F < result.code_point) {
    // Find which multi-byte form the lead byte announces. When no entry
    // matches, `expected_length` ends up greater than `max_length`.
    unsigned expected_length = 1;
    for (const auto &info : multi_byte_infos) {
      if ((result.code_point & info.id_mask) == info.id_matcher) {
        result.code_point &= info.data_mask;
        break;
      }
      ++expected_length;
    }

    // Unknown lead byte, or not enough input left for the continuation bytes.
    if (max_length < expected_length ||
        (end - position) < static_cast<difference_type>(expected_length)) {
      return result;
    }

    for (unsigned byte = 0; byte < expected_length; ++byte) {
      const std::uint8_t next_byte = *(position + byte);
      if ((next_byte & multi_byte_id_mask) != multi_byte_id_matcher) {
        return result; // not a continuation byte; leave it unconsumed
      }
      result.code_point <<= multi_byte_bits;
      result.code_point |= (next_byte & multi_byte_data_mask);
      ++result.bytes_processed;
    }

    if (max_code_point < result.code_point) {
      return result; // beyond the Unicode range
    }
    if (result.code_point < overlong[expected_length - 1]) {
      return result; // value would have fit in a shorter sequence
    }
    if (first_surrogate <= result.code_point &&
        result.code_point <= last_surrogate) {
      return result; // surrogates are not valid scalar values in UTF-8
    }
  }

  result.status = extraction::success;
  return result;
}
/// Convenience overload: decodes the first code point of `range` by
/// forwarding its `std::begin`/`std::end` iterators to the iterator overload.
template <typename Range>
constexpr extraction_attempt next_code_point(const Range &range) {
  const auto first = std::begin(range);
  const auto last = std::end(range);
  return next_code_point(first, last);
}
/// Returns the leading sub-range of `range` covering exactly the bytes that
/// `result` reports as processed.
template <typename T>
boost::iterator_range<T>
next_character_bytes(const boost::iterator_range<T> &range,
                     const extraction_attempt result) {
  const T first = range.begin();
  const T last = first + result.bytes_processed;
  return boost::make_iterator_range(first, last);
}
template <std::size_t Length>
constexpr bool test(const char (&range)[Length],
const extraction expected_status,
const std::uint32_t expected_code_point,
const std::uint8_t expected_bytes_processed) {
const extraction_attempt result =
next_code_point(std::begin(range), std::end(range) - 1);
switch (expected_status) {
case extraction::success:
return result.status == extraction::success &&
result.bytes_processed == expected_bytes_processed &&
result.code_point == expected_code_point;
case extraction::failure:
return result.status == extraction::failure &&
result.bytes_processed == expected_bytes_processed;
default:
return false;
}
}
/// Runs the compile-time decoder checks, then prints a sample string twice:
/// once verbatim, and once with every byte group that is not a printable
/// code point replaced by its hexadecimal bytes in square brackets.
int main() {
  // Each check lists: input, expected status, expected code point (only
  // compared on success), expected number of consumed bytes.
  static_assert(test("F", extraction::success, 'F', 1), "");
  static_assert(test("\0", extraction::success, 0, 1), "");
  static_assert(test("\x7F", extraction::success, 0x7F, 1), "");
  static_assert(test("\xFF\xFF", extraction::failure, 0, 1), "");
  static_assert(test("\xDF", extraction::failure, 0, 1), "");
  static_assert(test("\xDF\xFF", extraction::failure, 0, 1), "");
  static_assert(test("\xC1\xBF", extraction::failure, 0, 2), "");
  static_assert(test("\xC2\x80", extraction::success, 0x80, 2), "");
  static_assert(test("\xDF\xBF", extraction::success, 0x07FF, 2), "");
  static_assert(test("\xEF\xBF", extraction::failure, 0, 1), "");
  static_assert(test("\xEF\xBF\xFF", extraction::failure, 0, 2), "");
  static_assert(test("\xE0\x9F\xBF", extraction::failure, 0, 3), "");
  static_assert(test("\xE0\xA0\x80", extraction::success, 0x800, 3), "");
  static_assert(test("\xEF\xBF\xBF", extraction::success, 0xFFFF, 3), "");
  static_assert(test("\xF7\xBF\xBF", extraction::failure, 0, 1), "");
  static_assert(test("\xF7\xBF\xBF\xFF", extraction::failure, 0, 3), "");
  static_assert(test("\xF0\x8F\xBF\xBF", extraction::failure, 0, 4), "");
  static_assert(test("\xF0\x90\x80\x80", extraction::success, 0x10000, 4), "");
  static_assert(test("\xF4\x8F\xBF\xBF", extraction::success, 0x10FFFF, 4), "");
  static_assert(test("\xF7\xBF\xBF\xBF", extraction::failure, 0, 4), "");
  static_assert(test("𝕫", extraction::success, 0x1D56B, 4), "");

  constexpr const static char text[] =
      "Hello あにま ➦ 👙 𝕫⊆𝕢 \x02\x01\b \xff\xff\xff ";
  std::cout << text << std::endl;

  // NOTE(review): the range is built from the whole array, so it presumably
  // also covers the terminating NUL of `text` - confirm that is intended.
  auto data = boost::make_iterator_range(text);
  while (!data.empty()) {
    const extraction_attempt result = next_code_point(data);
    const auto bytes = next_character_bytes(data, result);

    // Only successfully decoded, printable code points are echoed verbatim;
    // decode failures and non-printable code points are both shown as hex.
    const bool printable =
        result.status == extraction::success &&
        boost::spirit::char_encoding::unicode::isprint(result.code_point);

    if (printable) {
      std::cout << bytes;
    } else {
      std::cout << "[" << std::hex << std::setfill('0');
      for (const auto byte : bytes) {
        // The field width resets after every insertion, so it has to be set
        // again for each byte to keep two hex digits per byte.
        std::cout << std::setw(2)
                  << static_cast<int>(static_cast<std::uint8_t>(byte));
      }
      std::cout << "]";
    }
    data.advance_begin(result.bytes_processed);
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment