Created
February 2, 2020 20:55
-
-
Save foxcpp/db3882d6c3114c4e7f209e8f6db42dd9 to your computer and use it in GitHub Desktop.
UTF-8 decoder for C++11 implemented as an iterator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
#include <utility>
/*
 * Exception type reported for malformed UTF-8 input.
 *
 * Derives from std::runtime_error, so what() returns the message passed to
 * the constructor and the error can be caught as std::runtime_error or
 * std::exception as well.
 */
struct utf8_error : public std::runtime_error {
    // Construct from a C string message (must not be null, as required by
    // std::runtime_error).
    utf8_error(const char* message) : std::runtime_error(message) {}

    // Construct from a std::string message, mirroring std::runtime_error.
    explicit utf8_error(const std::string& message) : std::runtime_error(message) {}
};
/*
 * utf8_it_error(msg) reports a decoding error from inside a function that
 * returns void. It must be used as a full statement followed by ';'.
 *
 * - With exceptions available and UTF8_IT_NOEXCEPT not defined, it throws
 *   utf8_error(msg).
 * - Otherwise it simply returns from the enclosing function (msg is unused).
 *
 * Exception support is detected through the standard feature-test macro and
 * through the older compiler-specific macros (__EXCEPTIONS for GCC/Clang,
 * _CPPUNWIND for MSVC) for toolchains that do not define __cpp_exceptions.
 *
 * The expansions deliberately carry no trailing ';' so that the macro behaves
 * like a single statement even in an unbraced if/else.
 */
#if !defined(UTF8_IT_NOEXCEPT) && \
    (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND))
#  define utf8_it_error(msg) throw utf8_error(msg)
#else
#  define utf8_it_error(msg) return
#endif
/* | |
* STL-compatible iterator for Unicode code points in UTF-8 encoded strings. | |
* | |
* Can be used to convert UTF-8 into UTF-32 and to process individual code
* points instead of the entire string. It does not provide any functionality
* for the latter, though.
* | |
* A utf8_iterator instance refers to the pair of begin/end iterators of the | |
* underlying container. Underlying iterators should satisfy the InputIterator | |
* concept. | |
* | |
* operator++ decodes the next code point in the stream, possibly incrementing
* the underlying iterator multiple times. If it reaches the past-the-end
* iterator in the middle of a sequence or hits an unexpected octet, a
* utf8_error exception is thrown. If exceptions are disabled (either via
* -fno-exceptions or #define UTF8_IT_NOEXCEPT), an error makes the iterator
* invalid instead. Dereferencing it will return the replacement (U+FFFD)
* code point.
*/ | |
template<typename It> | |
class utf8_iterator { | |
public: | |
template<typename Seq> | |
utf8_iterator(const Seq& s) : val(0), cur(std::begin(s)), end(std::end(s)) { | |
// If we are not constructing a past-the-end iterator - decode the | |
// first character. | |
if (this->cur != this->end) { | |
decode_cur(); | |
} | |
} | |
utf8_iterator(It begin, It end) : val(0), cur(begin), end(end) { | |
if (this->cur != this->end) { | |
decode_cur(); | |
} | |
} | |
utf8_iterator(const utf8_iterator&) = default; | |
utf8_iterator(utf8_iterator&&) = default; | |
utf8_iterator& operator=(const utf8_iterator&) = default; | |
utf8_iterator& operator=(utf8_iterator&&) = default; | |
~utf8_iterator() = default; | |
using value_type = typename std::iterator_traits<It>::value_type; | |
using difference_type = typename std::iterator_traits<It>::difference_type; | |
using reference = const typename std::iterator_traits<It>::reference; | |
using pointer = const typename std::iterator_traits<It>::pointer; | |
using iterator_category = std::forward_iterator_tag; | |
char32_t operator*() const { | |
return this->val; | |
} | |
utf8_iterator& operator++() { | |
this->val = 0xFFFD; | |
this->cur++; | |
// This is now past-the-end iterator, don't decode. | |
if (this->cur == this->end) { | |
return *this; | |
} | |
decode_cur(); | |
return *this; | |
} | |
utf8_iterator operator++(int) { | |
auto copy = *this; | |
++(*this); | |
return copy; | |
} | |
bool operator==(const utf8_iterator& other) const { | |
return this->cur == other.cur; | |
} | |
bool operator!=(const utf8_iterator& other) const { | |
return this->cur != other.cur; | |
} | |
It& cur_wrapped() { | |
return this->cur; | |
} | |
It cur, end; | |
private: | |
void decode_cur() { | |
auto octet = *this->cur; | |
int extra_octets = 0; | |
// One could use <= here to check prefix, but it needs reinterpret_case | |
// trickery to avoid getting confused by negative signed char values. | |
// E.g. '\xD0' (208 or -46) < '\x7F' (127) | |
if ((octet & 0b10000000) == 0) { | |
this->val = octet; | |
} else if ((octet & 0b11000000) == 0b10000000) { | |
utf8_it_error("unexpected continuation octet"); | |
return; | |
} else if ((octet & 0b11100000) == 0b11000000) { | |
this->val = octet & 0b00011111; | |
extra_octets = 1; | |
} else if ((octet & 0b11110000) == 0b11100000) { | |
this->val = octet & 0b00001111; | |
extra_octets = 2; | |
} else if ((octet & 0b11110000) == 0b11110000) { | |
this->val = octet & 0b00000111; | |
extra_octets = 3; | |
} | |
// Step to the next byte so it will be decoded next. | |
++this->cur; | |
// Consume continuation octets if necessary. | |
while (extra_octets > 0) { | |
if (this->cur == this->end) { | |
utf8_it_error("unexpected end of sequence"); | |
} | |
// Check whether it is a valid continuation octet. | |
octet = *this->cur; | |
if ((octet & uint8_t(0b10000000)) != uint8_t(0b10000000)) { | |
utf8_it_error("invalid continuation octet"); | |
} | |
// 'Append' last 6 bits from continuation octet to the code point value. | |
this->val <<= 6; | |
this->val |= (octet & uint8_t(0b00111111)); | |
// Make sure at the end iterator points to the latest octet of character. | |
// This allows us to compare iterators with each other and also | |
// with past-the-end iterator correctly. | |
--extra_octets; | |
if (extra_octets != 0) { | |
++this->cur; | |
} | |
} | |
} | |
char32_t val; | |
}; | |
#if __cplusplus >= 201703L | |
template<typename T> | |
utf8_iterator(const T&) -> utf8_iterator<typename T::iterator>; | |
#endif | |
/* | |
* Wrapper for convenient use of utf8_iterator in for-each loops. | |
* | |
* ``` | |
* for (auto ch : utf8_runes("test")) { | |
* std::cout << ch; | |
* } | |
* ``` | |
*/ | |
template<typename It> | |
struct utf8_runes { | |
template<typename T> | |
utf8_runes(const T& t) : _begin(std::begin(t), std::end(t)), _end(std::end(t), std::end(t)) {} | |
utf8_runes(It begin, It end) : _begin(begin), _end(end) {} | |
utf8_iterator<It> begin() { | |
return _begin; | |
} | |
utf8_iterator<It> end() { | |
return _end; | |
} | |
private: | |
utf8_iterator<It> _begin, _end; | |
}; | |
#if __cplusplus >= 201703L | |
template<typename T> | |
utf8_runes(const T&) -> utf8_runes<typename T::iterator>; | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment