Skip to content

Instantly share code, notes, and snippets.

@foxcpp
Created February 2, 2020 20:55
Show Gist options
  • Save foxcpp/db3882d6c3114c4e7f209e8f6db42dd9 to your computer and use it in GitHub Desktop.
Save foxcpp/db3882d6c3114c4e7f209e8f6db42dd9 to your computer and use it in GitHub Desktop.
UTF-8 decoder for C++11 implemented as an iterator
#pragma once
#include <iterator>
#include <stdexcept>
#include <utility>
/*
 * Exception type used to report malformed UTF-8 input.
 */
struct utf8_error : public std::runtime_error {
    utf8_error(const char* v) : std::runtime_error(v) {}
};

/*
 * utf8_it_error(msg) is the failure hook used by utf8_iterator::decode_cur().
 *
 * With exceptions available (and UTF8_IT_NOEXCEPT not defined) it throws
 * utf8_error(msg). Otherwise it expands to a plain `return;`, so it may only
 * be used inside functions returning void.
 *
 * __EXCEPTIONS (GCC/Clang) and _CPPUNWIND (MSVC) are checked in addition to
 * __cpp_exceptions so that compilers without feature-test macros do not
 * silently fall back to the non-throwing mode.
 */
#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) \
    && !defined(UTF8_IT_NOEXCEPT)
# define utf8_it_error(msg) throw utf8_error(msg)
#else
# define utf8_it_error(msg) return;
#endif

/*
 * STL-style iterator over the Unicode code points of a UTF-8 encoded octet
 * sequence.
 *
 * Can be used to convert UTF-8 into UTF-32 or to process a string one code
 * point at a time. It only decodes; it offers no further text processing.
 *
 * A utf8_iterator holds a [cur, end) pair of iterators into the underlying
 * sequence. While a character is being referred to, `cur` points at the LAST
 * octet of that character, so an iterator compares equal to the past-the-end
 * iterator only once every character has been stepped over.
 *
 * Construction (on a non-empty range) and operator++ decode one code point,
 * possibly advancing the underlying iterator several times. Decoding fails on:
 *   - a continuation octet or an octet 0xF8..0xFF in lead position,
 *   - the sequence ending in the middle of a character,
 *   - a lead octet followed by something that is not a continuation octet,
 *   - an overlong encoding, a surrogate (U+D800..U+DFFF) or a value above
 *     U+10FFFF.
 *
 * On failure utf8_error is thrown. If exceptions are disabled (either via
 * -fno-exceptions or #define UTF8_IT_NOEXCEPT) the iterator instead yields
 * the replacement character U+FFFD when dereferenced and stays on the octet
 * where decoding stopped; the next operator++ steps over that octet.
 */
template<typename It>
class utf8_iterator {
public:
    /*
     * Starts decoding at the beginning of `s`. Only iterators into `s` are
     * stored, so `s` must outlive this object. Decodes the first character
     * right away unless `s` is empty.
     */
    template<typename Seq>
    utf8_iterator(const Seq& s) : cur(std::begin(s)), end(std::end(s)), val(0) {
        if (this->cur != this->end) {
            decode_cur();
        }
    }

    /*
     * Starts decoding at `begin`; `end` bounds the octet sequence. Passing the
     * same iterator twice produces a past-the-end iterator without decoding.
     */
    utf8_iterator(It begin, It end) : cur(begin), end(end), val(0) {
        if (this->cur != this->end) {
            decode_cur();
        }
    }

    utf8_iterator(const utf8_iterator&) = default;
    utf8_iterator(utf8_iterator&&) = default;
    utf8_iterator& operator=(const utf8_iterator&) = default;
    utf8_iterator& operator=(utf8_iterator&&) = default;
    ~utf8_iterator() = default;

    // These describe what operator* yields (a code point by value), not the
    // octet type of the wrapped iterator.
    using value_type = char32_t;
    using difference_type = typename std::iterator_traits<It>::difference_type;
    using reference = char32_t;
    using pointer = const char32_t*;
    // NOTE(review): kept as in the original; with a single-pass underlying
    // iterator this tag promises more than can be delivered - confirm.
    using iterator_category = std::forward_iterator_tag;

    /* Returns the most recently decoded code point. */
    char32_t operator*() const {
        return this->val;
    }

    /*
     * Moves to the next character and decodes it. When the end of the
     * sequence is reached nothing is decoded and the stored value is U+FFFD.
     */
    utf8_iterator& operator++() {
        this->val = 0xFFFD;
        // `cur` sits on the last octet of the current character, so a single
        // step lands on the lead octet of the next one (or on `end`).
        ++this->cur;
        if (this->cur != this->end) {
            decode_cur();
        }
        return *this;
    }

    utf8_iterator operator++(int) {
        auto copy = *this;
        ++(*this);
        return copy;
    }

    /* Iterators are equal when they refer to the same underlying position. */
    bool operator==(const utf8_iterator& other) const {
        return this->cur == other.cur;
    }

    bool operator!=(const utf8_iterator& other) const {
        return this->cur != other.cur;
    }

    /* Direct access to the wrapped position iterator. */
    It& cur_wrapped() {
        return this->cur;
    }

    It cur;
    It end;

private:
    /*
     * Decodes the character whose lead octet is at `cur` (which must not be
     * `end`) into `val`, leaving `cur` on the last octet that was consumed.
     */
    void decode_cur() {
        // Pessimistic default: every early exit below (a plain `return` when
        // exceptions are disabled) leaves the replacement character behind.
        this->val = 0xFFFD;

        // Go through unsigned char so a negative `char` cannot leak sign bits
        // into the masks and comparisons below.
        const unsigned lead = static_cast<unsigned char>(*this->cur);
        char32_t decoded = 0;
        char32_t smallest = 0;  // lowest value that needs this many octets
        int extra_octets = 0;

        if (lead < 0x80u) {                     // 0xxxxxxx
            decoded = lead;
        } else if ((lead & 0xC0u) == 0x80u) {   // 10xxxxxx
            utf8_it_error("unexpected continuation octet");
        } else if ((lead & 0xE0u) == 0xC0u) {   // 110xxxxx
            decoded = lead & 0x1Fu;
            extra_octets = 1;
            smallest = 0x80;
        } else if ((lead & 0xF0u) == 0xE0u) {   // 1110xxxx
            decoded = lead & 0x0Fu;
            extra_octets = 2;
            smallest = 0x800;
        } else if ((lead & 0xF8u) == 0xF0u) {   // 11110xxx
            decoded = lead & 0x07u;
            extra_octets = 3;
            smallest = 0x10000;
        } else {                                // 11111xxx
            utf8_it_error("invalid lead octet");
        }

        // Advance only when another octet is actually required, so that `cur`
        // ends up on the last octet of the character (for ASCII: the lead).
        for (; extra_octets > 0; --extra_octets) {
            ++this->cur;
            if (this->cur == this->end) {
                utf8_it_error("unexpected end of sequence");
            }
            const unsigned octet = static_cast<unsigned char>(*this->cur);
            if ((octet & 0xC0u) != 0x80u) {
                utf8_it_error("invalid continuation octet");
            }
            // Append the 6 payload bits of the continuation octet.
            decoded = (decoded << 6) | (octet & 0x3Fu);
        }

        const bool overlong = decoded < smallest;
        const bool surrogate = decoded >= 0xD800 && decoded <= 0xDFFF;
        if (overlong || surrogate || decoded > 0x10FFFF) {
            utf8_it_error("overlong encoding or invalid code point");
        }

        this->val = decoded;
    }

    char32_t val;
};

#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
// The sequence is taken by const reference, so the stored iterator type has
// to be whatever std::begin yields for a const sequence (const_iterator for
// containers, const T* for arrays).
template<typename T>
utf8_iterator(const T&) -> utf8_iterator<decltype(std::begin(std::declval<const T&>()))>;
#endif
/*
 * Wrapper for convenient use of utf8_iterator in range-based for loops.
 *
 * ```
 * std::string text = "test";
 * for (char32_t ch : utf8_runes<std::string::const_iterator>(text)) {
 *     std::cout << static_cast<unsigned long>(ch) << '\n';
 * }
 * ```
 *
 * With C++17 the template argument can be omitted: utf8_runes(text).
 *
 * Only iterators are stored, so the wrapped sequence must outlive the
 * utf8_runes object (do not pass a temporary container). When a string
 * literal or char array is passed directly, its terminating '\0' is part of
 * the range and is produced as U+0000.
 *
 * Constructing a utf8_runes over a non-empty range already decodes the first
 * character, so decoding errors can surface in the constructor.
 */
template<typename It>
struct utf8_runes {
    /* Wraps the whole sequence `t`, i.e. [std::begin(t), std::end(t)). */
    template<typename T>
    utf8_runes(const T& t) : _begin(std::begin(t), std::end(t)), _end(std::end(t), std::end(t)) {}

    /*
     * Wraps the octet range [begin, end). Both stored iterators are built
     * with the two-iterator utf8_iterator constructor: passing a single
     * iterator would select utf8_iterator's sequence constructor instead,
     * which cannot be instantiated for an iterator argument.
     */
    utf8_runes(It begin, It end) : _begin(begin, end), _end(end, end) {}

    /* Iterator referring to the first code point. */
    utf8_iterator<It> begin() const {
        return _begin;
    }

    /* Past-the-end iterator. */
    utf8_iterator<It> end() const {
        return _end;
    }

private:
    utf8_iterator<It> _begin, _end;
};

#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
// The sequence is taken by const reference, so deduce the iterator type that
// std::begin yields for a const sequence (const_iterator for containers,
// const T* for arrays).
template<typename T>
utf8_runes(const T&) -> utf8_runes<decltype(std::begin(std::declval<const T&>()))>;
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment