Created
November 21, 2022 15:55
-
-
Save vtnerd/e1f065eb3fc1e523b5bc32c573593663 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <iomanip> | |
#include <iostream> | |
#include <iterator> | |
#include <boost/range/iterator_range.hpp> | |
#include <boost/spirit/home/support/char_encoding/unicode.hpp> | |
namespace {

/// Bit patterns used to recognise one class of UTF-8 lead byte.
struct multi_byte_info {
  std::uint8_t id_mask;    ///< Selects the marker bits of a lead byte.
  std::uint8_t id_matcher; ///< Value the marker bits must equal to match.
  std::uint8_t data_mask;  ///< Selects the payload bits of a lead byte.
};

// Continuation bytes have the form 10xxxxxx and carry six payload bits each.
constexpr std::uint8_t multi_byte_id_mask = 0xC0;
constexpr std::uint8_t multi_byte_id_matcher = 0x80;
constexpr std::uint8_t multi_byte_data_mask = 0x3F;
constexpr std::uint8_t multi_byte_bits = 6;

// Lead-byte classes, ordered by the number of continuation bytes that
// follow (entry 0 -> one continuation byte, entry 1 -> two, ...).
// Single-byte (ASCII) characters need no entry.
constexpr multi_byte_info multi_byte_infos[] = {
    {0xE0, 0xC0, 0x1F}, // 110xxxxx
    {0xF0, 0xE0, 0x0F}, // 1110xxxx
    {0xF8, 0xF0, 0x07}, // 11110xxx
};

// Largest number of continuation bytes any supported lead byte announces.
constexpr unsigned max_length =
    (sizeof(multi_byte_infos) / sizeof(multi_byte_infos[0]));

// overlong[n - 1] is the smallest code point allowed for a sequence with n
// continuation bytes; anything smaller is treated as an overlong encoding.
constexpr std::uint32_t overlong[] = {0x80, 0x800, 0x10000};

// Highest code point the decoder accepts.
constexpr std::uint32_t max_code_point = 0x10FFFF;

} // namespace
/// Outcome of trying to decode a single code point.
enum class extraction : std::uint8_t {
  success, ///< A complete, acceptable sequence was decoded.
  failure  ///< The input was empty or the sequence was rejected.
};

/// Result record returned by the decoder.
struct extraction_attempt {
  /// Decoded value; only meaningful when `status` is `extraction::success`.
  std::uint32_t code_point;
  /// Number of input bytes the decoder consumed for this attempt.
  std::uint8_t bytes_processed;
  /// Whether decoding succeeded.
  extraction status;
};
template <typename Iterator> | |
constexpr extraction_attempt next_code_point(Iterator position, | |
const Iterator &end) { | |
static_assert( | |
std::is_same<typename std::iterator_traits<Iterator>::iterator_category, | |
std::random_access_iterator_tag>{}, | |
"bad iterator type"); | |
extraction_attempt result{0, 0, extraction::failure}; | |
if (end - position) { | |
result.code_point = std::uint8_t(*position); | |
++position; | |
++result.bytes_processed; | |
if (0x7F < result.code_point) { | |
unsigned expected_length = 1; | |
for (const auto info : multi_byte_infos) { | |
if ((result.code_point & info.id_mask) == info.id_matcher) { | |
result.code_point &= info.data_mask; | |
break; | |
} | |
++expected_length; | |
} | |
if (max_length < expected_length || (end - position) < expected_length) { | |
return result; | |
} | |
for (unsigned byte = 0; byte < expected_length; ++byte) { | |
const std::uint8_t next_byte = *(position + byte); | |
if ((next_byte & multi_byte_id_mask) != multi_byte_id_matcher) { | |
return result; | |
} | |
result.code_point <<= multi_byte_bits; | |
result.code_point |= (next_byte & multi_byte_data_mask); | |
++result.bytes_processed; | |
} | |
if (max_code_point < result.code_point) { | |
return result; | |
} | |
if (overlong[expected_length - 1] > result.code_point) { | |
return result; | |
} | |
} | |
result.status = extraction::success; | |
} // end multi-byte processing | |
return result; | |
} | |
template <typename Range> | |
constexpr extraction_attempt next_code_point(const Range &range) { | |
return next_code_point(std::begin(range), std::end(range)); | |
} | |
template <typename T> | |
boost::iterator_range<T> | |
next_character_bytes(const boost::iterator_range<T> &range, | |
const extraction_attempt result) { | |
return boost::make_iterator_range(range.begin(), | |
range.begin() + result.bytes_processed); | |
} | |
template <std::size_t Length> | |
constexpr bool test(const char (&range)[Length], | |
const extraction expected_status, | |
const std::uint32_t expected_code_point, | |
const std::uint8_t expected_bytes_processed) { | |
const extraction_attempt result = | |
next_code_point(std::begin(range), std::end(range) - 1); | |
switch (expected_status) { | |
case extraction::success: | |
return result.status == extraction::success && | |
result.bytes_processed == expected_bytes_processed && | |
result.code_point == expected_code_point; | |
case extraction::failure: | |
return result.status == extraction::failure && | |
result.bytes_processed == expected_bytes_processed; | |
default: | |
return false; | |
} | |
} | |
int main() { | |
static_assert(test("F", extraction::success, 'F', 1), ""); | |
static_assert(test("\0", extraction::success, 0, 1), ""); | |
static_assert(test("\x7F", extraction::success, 0x7F, 1), ""); | |
static_assert(test("\xFF\xFF", extraction::failure, 0, 1), ""); | |
static_assert(test("\xDF", extraction::failure, 0, 1), ""); | |
static_assert(test("\xDF\xFF", extraction::failure, 0, 1), ""); | |
static_assert(test("\xC1\xBF", extraction::failure, 0, 2), ""); | |
static_assert(test("\xC2\x80", extraction::success, 0x80, 2), ""); | |
static_assert(test("\xDF\xBF", extraction::success, 0x07FF, 2), ""); | |
static_assert(test("\xEF\xBF", extraction::failure, 0, 1), ""); | |
static_assert(test("\xEF\xBF\xFF", extraction::failure, 0, 2), ""); | |
static_assert(test("\xE0\x9F\xBF", extraction::failure, 0, 3), ""); | |
static_assert(test("\xE0\xA0\x80", extraction::success, 0x800, 3), ""); | |
static_assert(test("\xEF\xBF\xBF", extraction::success, 0xFFFF, 3), ""); | |
static_assert(test("\xF7\xBF\xBF", extraction::failure, 0, 1), ""); | |
static_assert(test("\xF7\xBF\xBF\xFF", extraction::failure, 0, 3), ""); | |
static_assert(test("\xF0\x8F\xBF\xBF", extraction::failure, 0, 4), ""); | |
static_assert(test("\xF0\x90\x80\x80", extraction::success, 0x10000, 4), ""); | |
static_assert(test("\xF4\x8F\xBF\xBF", extraction::success, 0x10FFFF, 4), ""); | |
static_assert(test("\xF7\xBF\xBF\xBF", extraction::failure, 0, 4), ""); | |
static_assert(test("𝕫", extraction::success, 0x1D56B, 4), ""); | |
constexpr const static char text[] = | |
"Hello あにま ➦ 👙 𝕫⊆𝕢 \x02\x01\b \xff\xff\xff "; | |
std::cout << text << std::endl; | |
auto data = boost::make_iterator_range(text); | |
while (!data.empty()) { | |
const extraction_attempt result = next_code_point(data); | |
switch (result.status) { | |
case extraction::success: | |
if (boost::spirit::char_encoding::unicode::isprint(result.code_point)) { | |
std::cout << next_character_bytes(data, result); | |
break; | |
} | |
default: | |
case extraction::failure: | |
std::cout << "["; | |
std::cout << std::hex << std::setw(2) << std::setfill('0'); | |
for (const auto byte : next_character_bytes(data, result)) { | |
std::cout << int(std::uint8_t(byte)); | |
} | |
std::cout << "]"; | |
break; | |
} | |
data.advance_begin(result.bytes_processed); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment