foxcpp/utf8_iterator.hpp

## utf8_iterator.hpp
#pragma once

#include <iterator>
#include <stdexcept>
#include <utility>

/*
 * Error raised by utf8_iterator when the input is not well-formed UTF-8.
 * It is only thrown when exceptions are enabled (see utf8_it_error below).
 */
struct utf8_error : public std::runtime_error {
    utf8_error(const char* v) : std::runtime_error(v) {}
};

/*
 * utf8_it_error(msg) reports a decoding failure from inside a function that
 * returns void:
 *  - if the compiler has exceptions enabled and UTF8_IT_NOEXCEPT is not
 *    defined, it throws utf8_error(msg);
 *  - otherwise it returns from the enclosing function.
 *
 * The second form carries its own trailing semicolon, so the macro has to be
 * used as a complete statement inside braces.
 */
#if defined(__cpp_exceptions) && !defined(UTF8_IT_NOEXCEPT)
#   define utf8_it_error(msg) throw utf8_error(msg)
#else
#   define utf8_it_error(msg) return;
#endif

/*
 * STL-compatible iterator over Unicode code points in UTF-8 encoded strings.
 *
 * Can be used to convert UTF-8 into UTF-32 and to process individual code
 * points instead of the entire string. It does not provide any functionality
 * for the latter, though.
 *
 * A utf8_iterator instance holds the pair of begin/end iterators of the
 * underlying container. Underlying iterators should satisfy the InputIterator
 * concept.
 *
 * Construction and operator++ decode one code point, possibly incrementing
 * the underlying iterator multiple times. After a successful decode `cur`
 * refers to the LAST octet of the decoded code point (never past it), which
 * is what makes iterators comparable with each other and with the
 * past-the-end iterator.
 *
 * If the past-the-end iterator is hit in the middle of a sequence, or an
 * unexpected octet is found, a utf8_error exception is thrown. If exceptions
 * are disabled (either via -fno-exceptions or #define UTF8_IT_NOEXCEPT) the
 * error is not reported; dereferencing the iterator then returns the
 * replacement (U+FFFD) code point.
 *
 * NOTE: only the octet structure is validated. Overlong encodings, UTF-16
 * surrogates and values above U+10FFFF are decoded as-is, not rejected.
 */
template<typename It>
class utf8_iterator {
public:
    // Iterator positioned at the first code point of the sequence `s`
    // (anything std::begin/std::end accept). Decodes the first code point
    // immediately unless the sequence is empty.
    template<typename Seq>
    utf8_iterator(const Seq& s) : cur(std::begin(s)), end(std::end(s)), val(0) {
        // If we are not constructing a past-the-end iterator - decode the
        // first character.
        if (this->cur != this->end) {
            decode_cur();
        }
    }
    // Same as above for an explicit [begin, end) range. Passing begin == end
    // produces the past-the-end iterator.
    utf8_iterator(It begin, It end) : cur(begin), end(end), val(0) {
        if (this->cur != this->end) {
            decode_cur();
        }
    }
    utf8_iterator(const utf8_iterator&) = default;
    utf8_iterator(utf8_iterator&&) = default;
    utf8_iterator& operator=(const utf8_iterator&) = default;
    utf8_iterator& operator=(utf8_iterator&&) = default;
    ~utf8_iterator() = default;

    // The traits describe what operator* yields (a code point by value), not
    // the octet type of the underlying iterator.
    using value_type = char32_t;
    using difference_type = typename std::iterator_traits<It>::difference_type;
    using reference = char32_t;
    using pointer = const char32_t*;
    using iterator_category = std::forward_iterator_tag;

    // Returns the code point decoded by the constructor or the last
    // operator++ (U+FFFD if that decode failed without throwing).
    char32_t operator*() const {
        return this->val;
    }

    // Steps to the next code point and decodes it. May throw utf8_error
    // (see the class comment). Does nothing on a past-the-end iterator.
    utf8_iterator& operator++() {
        this->val = replacement;
        if (this->cur == this->end) {
            return *this;
        }
        // `cur` refers to the last octet of the current code point, so a
        // single step moves to the lead octet of the next one.
        ++this->cur;
        if (this->cur != this->end) {
            decode_cur();
        }
        return *this;
    }

    utf8_iterator operator++(int) {
        auto copy = *this;
        ++(*this);
        return copy;
    }

    // Iterators are equal when they refer to the same underlying position.
    bool operator==(const utf8_iterator& other) const {
        return this->cur == other.cur;
    }
    bool operator!=(const utf8_iterator& other) const {
        return this->cur != other.cur;
    }

    // Direct access to the wrapped iterator (last octet of the current code
    // point, or the end iterator).
    It& cur_wrapped() {
        return this->cur;
    }

    It cur, end;
private:
    static constexpr char32_t replacement = 0xFFFD;

    // Decodes the code point whose lead octet is at `cur` (requires
    // cur != end) into `val`, leaving `cur` on the last octet consumed.
    // On failure `val` is U+FFFD and the error is reported via utf8_it_error.
    void decode_cur() {
        // Assume failure until the whole sequence has been read.
        this->val = replacement;

        // Work on an unsigned octet so the prefix tests do not depend on the
        // signedness of plain char.
        const auto lead = static_cast<unsigned char>(*this->cur);
        char32_t code_point = 0;
        int extra_octets = 0;

        if ((lead & 0b10000000) == 0) {                      // 0xxxxxxx
            code_point = lead;
        } else if ((lead & 0b11000000) == 0b10000000) {      // 10xxxxxx
            utf8_it_error("unexpected continuation octet");
            return;
        } else if ((lead & 0b11100000) == 0b11000000) {      // 110xxxxx
            code_point = lead & 0b00011111;
            extra_octets = 1;
        } else if ((lead & 0b11110000) == 0b11100000) {      // 1110xxxx
            code_point = lead & 0b00001111;
            extra_octets = 2;
        } else if ((lead & 0b11111000) == 0b11110000) {      // 11110xxx
            code_point = lead & 0b00000111;
            extra_octets = 3;
        } else {                                             // 11111xxx
            utf8_it_error("invalid leading octet");
            return;
        }

        // Consume continuation octets if necessary. The underlying iterator
        // is advanced only when another octet is actually needed, so it ends
        // up on the last octet of the character.
        for (; extra_octets > 0; --extra_octets) {
            ++this->cur;
            if (this->cur == this->end) {
                utf8_it_error("unexpected end of sequence");
                return;
            }

            // A continuation octet must look like 10xxxxxx.
            const auto octet = static_cast<unsigned char>(*this->cur);
            if ((octet & 0b11000000) != 0b10000000) {
                utf8_it_error("invalid continuation octet");
                return;
            }

            // 'Append' last 6 bits from continuation octet to the code point value.
            code_point = (code_point << 6) | (octet & 0b00111111);
        }

        this->val = code_point;
    }
    char32_t val;
};

// Deduce the wrapped iterator type from a sequence: it is whatever
// std::begin returns for a const reference, which is what the constructor
// stores.
#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
template<typename T>
utf8_iterator(const T&) -> utf8_iterator<decltype(std::begin(std::declval<const T&>()))>;
#endif

/*
 * Wrapper for convenient use of utf8_iterator in range-based for loops.
 *
 * ```
 * std::string text = "test";
 * for (char32_t ch : utf8_runes(text)) {
 *     std::cout << static_cast<unsigned long>(ch) << '\n';
 * }
 * ```
 *
 * The wrapper only stores iterators into the sequence it was built from, so
 * that sequence has to outlive the wrapper. The first code point is decoded
 * when the wrapper is constructed.
 */
template<typename It>
struct utf8_runes {
    // Covers the whole sequence `t` (anything std::begin/std::end accept).
    template<typename T>
    utf8_runes(const T& t) : _begin(std::begin(t), std::end(t)), _end(std::end(t), std::end(t)) {}
    // Covers the range [begin, end). Both wrapped iterators are built with the
    // two-iterator constructor; the single-argument one expects a sequence.
    utf8_runes(It begin, It end) : _begin(begin, end), _end(end, end) {}

    // Iterator at the first code point.
    utf8_iterator<It> begin() const {
        return _begin;
    }
    // Past-the-end iterator.
    utf8_iterator<It> end() const {
        return _end;
    }
private:
    utf8_iterator<It> _begin, _end;
};

// Deduce the wrapped iterator type from a sequence: it is whatever
// std::begin returns for a const reference, which is what the constructor
// passes on.
#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
template<typename T>
utf8_runes(const T&) -> utf8_runes<decltype(std::begin(std::declval<const T&>()))>;
#endif
	#pragma once

	#include <stdexcept>
	#include <iterator>

	/*
	 * Error raised by utf8_iterator when the input is not well-formed UTF-8.
	 * It is only thrown when exceptions are enabled (see utf8_it_error below).
	 */
	struct utf8_error : public std::runtime_error {
	    utf8_error(const char* v) : std::runtime_error(v) {}
	};

	/*
	 * utf8_it_error(msg) reports a decoding failure from inside a function that
	 * returns void:
	 *  - if the compiler has exceptions enabled and UTF8_IT_NOEXCEPT is not
	 *    defined, it throws utf8_error(msg);
	 *  - otherwise it returns from the enclosing function.
	 *
	 * The second form carries its own trailing semicolon, so the macro has to be
	 * used as a complete statement inside braces.
	 */
	#if defined(__cpp_exceptions) && !defined(UTF8_IT_NOEXCEPT)
	#   define utf8_it_error(msg) throw utf8_error(msg)
	#else
	#   define utf8_it_error(msg) return;
	#endif

	/*
	 * STL-compatible iterator over Unicode code points in UTF-8 encoded strings.
	 *
	 * Can be used to convert UTF-8 into UTF-32 and to process individual code
	 * points instead of the entire string. It does not provide any functionality
	 * for the latter, though.
	 *
	 * A utf8_iterator instance holds the pair of begin/end iterators of the
	 * underlying container. Underlying iterators should satisfy the InputIterator
	 * concept.
	 *
	 * Construction and operator++ decode one code point, possibly incrementing
	 * the underlying iterator multiple times. After a successful decode `cur`
	 * refers to the LAST octet of the decoded code point (never past it), which
	 * is what makes iterators comparable with each other and with the
	 * past-the-end iterator.
	 *
	 * If the past-the-end iterator is hit in the middle of a sequence, or an
	 * unexpected octet is found, a utf8_error exception is thrown. If exceptions
	 * are disabled (either via -fno-exceptions or #define UTF8_IT_NOEXCEPT) the
	 * error is not reported; dereferencing the iterator then returns the
	 * replacement (U+FFFD) code point.
	 *
	 * NOTE: only the octet structure is validated. Overlong encodings, UTF-16
	 * surrogates and values above U+10FFFF are decoded as-is, not rejected.
	 */
	template<typename It>
	class utf8_iterator {
	public:
	    // Iterator positioned at the first code point of the sequence `s`
	    // (anything std::begin/std::end accept). Decodes the first code point
	    // immediately unless the sequence is empty.
	    template<typename Seq>
	    utf8_iterator(const Seq& s) : cur(std::begin(s)), end(std::end(s)), val(0) {
	        // If we are not constructing a past-the-end iterator - decode the
	        // first character.
	        if (this->cur != this->end) {
	            decode_cur();
	        }
	    }
	    // Same as above for an explicit [begin, end) range. Passing begin == end
	    // produces the past-the-end iterator.
	    utf8_iterator(It begin, It end) : cur(begin), end(end), val(0) {
	        if (this->cur != this->end) {
	            decode_cur();
	        }
	    }
	    utf8_iterator(const utf8_iterator&) = default;
	    utf8_iterator(utf8_iterator&&) = default;
	    utf8_iterator& operator=(const utf8_iterator&) = default;
	    utf8_iterator& operator=(utf8_iterator&&) = default;
	    ~utf8_iterator() = default;

	    // The traits describe what operator* yields (a code point by value), not
	    // the octet type of the underlying iterator.
	    using value_type = char32_t;
	    using difference_type = typename std::iterator_traits<It>::difference_type;
	    using reference = char32_t;
	    using pointer = const char32_t*;
	    using iterator_category = std::forward_iterator_tag;

	    // Returns the code point decoded by the constructor or the last
	    // operator++ (U+FFFD if that decode failed without throwing).
	    char32_t operator*() const {
	        return this->val;
	    }

	    // Steps to the next code point and decodes it. May throw utf8_error
	    // (see the class comment). Does nothing on a past-the-end iterator.
	    utf8_iterator& operator++() {
	        this->val = replacement;
	        if (this->cur == this->end) {
	            return *this;
	        }
	        // `cur` refers to the last octet of the current code point, so a
	        // single step moves to the lead octet of the next one.
	        ++this->cur;
	        if (this->cur != this->end) {
	            decode_cur();
	        }
	        return *this;
	    }

	    utf8_iterator operator++(int) {
	        auto copy = *this;
	        ++(*this);
	        return copy;
	    }

	    // Iterators are equal when they refer to the same underlying position.
	    bool operator==(const utf8_iterator& other) const {
	        return this->cur == other.cur;
	    }
	    bool operator!=(const utf8_iterator& other) const {
	        return this->cur != other.cur;
	    }

	    // Direct access to the wrapped iterator (last octet of the current code
	    // point, or the end iterator).
	    It& cur_wrapped() {
	        return this->cur;
	    }

	    It cur, end;
	private:
	    static constexpr char32_t replacement = 0xFFFD;

	    // Decodes the code point whose lead octet is at `cur` (requires
	    // cur != end) into `val`, leaving `cur` on the last octet consumed.
	    // On failure `val` is U+FFFD and the error is reported via utf8_it_error.
	    void decode_cur() {
	        // Assume failure until the whole sequence has been read.
	        this->val = replacement;

	        // Work on an unsigned octet so the prefix tests do not depend on the
	        // signedness of plain char.
	        const auto lead = static_cast<unsigned char>(*this->cur);
	        char32_t code_point = 0;
	        int extra_octets = 0;

	        if ((lead & 0b10000000) == 0) {                      // 0xxxxxxx
	            code_point = lead;
	        } else if ((lead & 0b11000000) == 0b10000000) {      // 10xxxxxx
	            utf8_it_error("unexpected continuation octet");
	            return;
	        } else if ((lead & 0b11100000) == 0b11000000) {      // 110xxxxx
	            code_point = lead & 0b00011111;
	            extra_octets = 1;
	        } else if ((lead & 0b11110000) == 0b11100000) {      // 1110xxxx
	            code_point = lead & 0b00001111;
	            extra_octets = 2;
	        } else if ((lead & 0b11111000) == 0b11110000) {      // 11110xxx
	            code_point = lead & 0b00000111;
	            extra_octets = 3;
	        } else {                                             // 11111xxx
	            utf8_it_error("invalid leading octet");
	            return;
	        }

	        // Consume continuation octets if necessary. The underlying iterator
	        // is advanced only when another octet is actually needed, so it ends
	        // up on the last octet of the character.
	        for (; extra_octets > 0; --extra_octets) {
	            ++this->cur;
	            if (this->cur == this->end) {
	                utf8_it_error("unexpected end of sequence");
	                return;
	            }

	            // A continuation octet must look like 10xxxxxx.
	            const auto octet = static_cast<unsigned char>(*this->cur);
	            if ((octet & 0b11000000) != 0b10000000) {
	                utf8_it_error("invalid continuation octet");
	                return;
	            }

	            // 'Append' last 6 bits from continuation octet to the code point value.
	            code_point = (code_point << 6) | (octet & 0b00111111);
	        }

	        this->val = code_point;
	    }
	    char32_t val;
	};

	// Deduce the wrapped iterator type from a sequence: it is whatever
	// std::begin returns for a const reference, which is what the constructor
	// stores.
	#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
	template<typename T>
	utf8_iterator(const T&) -> utf8_iterator<decltype(std::begin(std::declval<const T&>()))>;
	#endif

	/*
	 * Wrapper for convenient use of utf8_iterator in range-based for loops.
	 *
	 * ```
	 * std::string text = "test";
	 * for (char32_t ch : utf8_runes(text)) {
	 *     std::cout << static_cast<unsigned long>(ch) << '\n';
	 * }
	 * ```
	 *
	 * The wrapper only stores iterators into the sequence it was built from, so
	 * that sequence has to outlive the wrapper. The first code point is decoded
	 * when the wrapper is constructed.
	 */
	template<typename It>
	struct utf8_runes {
	    // Covers the whole sequence `t` (anything std::begin/std::end accept).
	    template<typename T>
	    utf8_runes(const T& t) : _begin(std::begin(t), std::end(t)), _end(std::end(t), std::end(t)) {}
	    // Covers the range [begin, end). Both wrapped iterators are built with the
	    // two-iterator constructor; the single-argument one expects a sequence.
	    utf8_runes(It begin, It end) : _begin(begin, end), _end(end, end) {}

	    // Iterator at the first code point.
	    utf8_iterator<It> begin() const {
	        return _begin;
	    }
	    // Past-the-end iterator.
	    utf8_iterator<It> end() const {
	        return _end;
	    }
	private:
	    utf8_iterator<It> _begin, _end;
	};

	// Deduce the wrapped iterator type from a sequence: it is whatever
	// std::begin returns for a const reference, which is what the constructor
	// passes on.
	#if defined(__cpp_deduction_guides) || __cplusplus >= 201703L
	template<typename T>
	utf8_runes(const T&) -> utf8_runes<decltype(std::begin(std::declval<const T&>()))>;
	#endif