m-ou-se/0 utf.hpp

## 0 utf.hpp
#include <array>
#include <iostream>
#include <stdexcept>
#include <string>
#include <cstdint>

/// A single character stored as its UTF-8 byte sequence.
/**
 * The sequence lives in a fixed buffer of four chars. The length of the
 * sequence is derived from the first byte, see size().
 */
class utf8_char {

private:
	std::array<char, 4> bytes = {{}};

public:
	/// Create the sentinel value: four 0xFF bytes, which is not valid UTF-8.
	/**
	 * No character encodes to this sequence, so it is usable as an 'EOF' marker.
	 */
	constexpr utf8_char() : bytes{{'\xFF', '\xFF', '\xFF', '\xFF'}} {}

	/// Encode the given code point as UTF-8.
	/**
	 * \note Throws a std::domain_error for values of 0x110000 and above.
	 */
	utf8_char(char32_t);

	/// Create from up to four raw bytes; bytes that are not given are zero.
	constexpr utf8_char(char a, char b = 0, char c = 0, char d = 0) : bytes{{a, b, c, d}} {}

	/// Create from the first four chars of the given array.
	constexpr explicit utf8_char(char a[4]) : bytes{{a[0], a[1], a[2], a[3]}} {}

	/// Pointer to the first of the four stored bytes.
	char const * data() const { return bytes.data(); }

	/// Length in bytes of a UTF-8 sequence that starts with the given byte.
	/**
	 * \note Throws a std::domain_error when the byte matches none of the four
	 *       lead-byte patterns below.
	 */
	static constexpr size_t size(char first_byte) {
		return
			(first_byte & 0x80) == 0x00 ? 1 :  // 0xxxxxxx
			(first_byte & 0xE0) == 0xC0 ? 2 :  // 110xxxxx
			(first_byte & 0xF0) == 0xE0 ? 3 :  // 1110xxxx
			(first_byte & 0xF8) == 0xF0 ? 4 :  // 11110xxx
			throw std::domain_error{"Invalid utf8 multi-byte sequence."};
	}

	/// Length in bytes of this sequence, as implied by its first byte.
	size_t size() const { return size(bytes[0]); }

	// Comparisons look at all four stored bytes, in order.
	bool operator == (utf8_char rhs) const { return bytes == rhs.bytes; }
	bool operator != (utf8_char rhs) const { return !(bytes == rhs.bytes); }

	bool operator <  (utf8_char rhs) const { return bytes < rhs.bytes; }
	bool operator >  (utf8_char rhs) const { return rhs.bytes < bytes; }
	bool operator <= (utf8_char rhs) const { return !(rhs.bytes < bytes); }
	bool operator >= (utf8_char rhs) const { return !(bytes < rhs.bytes); }

	/// False only for the sentinel produced by the default constructor.
	explicit operator bool () const { return !(*this == utf8_char{}); }

	/// The character as a single UTF-32 value.
	char32_t char32() const { return code_point(); }

	/// The unicode code point this UTF-8 sequence decodes to.
	uint32_t code_point() const;

	friend std::ostream & operator << (std::ostream &, utf8_char);
	friend std::string & operator += (std::string & out, utf8_char c);

	/// A std::string holding exactly the bytes of this sequence.
	explicit operator std::string() const {
		std::string result;
		result += *this;
		return result;
	}

};

/// A single character stored as its UTF-16 sequence of one or two code units.
/**
 * Both units are always stored; a second unit of zero means the sequence
 * consists of the first unit only, see size().
 */
class utf16_char {

private:
	std::array<char16_t, 2> ints = {{}};

public:
	/// Create the sentinel value: two 0xFFFF units.
	/**
	 * It doesn't represent any character, so it is usable as an 'EOF' marker.
	 */
	constexpr utf16_char() : ints{{u'\xFFFF', u'\xFFFF'}} {}

	/// Encode the given code point as one or two UTF-16 code units.
	utf16_char(char32_t);

	/// Re-encode a UTF-8 character by going through its code point.
	utf16_char(utf8_char c) : utf16_char(c.char32()) {}

	/// Create from raw code units; the second unit is zero when not given.
	constexpr utf16_char(char16_t a, char16_t b = 0) : ints{{a, b}} {}

	/// Create from the first two units of the given array.
	constexpr explicit utf16_char(char16_t a[2]) : ints{{a[0], a[1]}} {}

	/// Pointer to the first of the two stored code units.
	char16_t const * data() const { return ints.data(); }

	/// Number of code units in a UTF-16 sequence starting with the given unit.
	/**
	 * Units of the form 110110xx xxxxxxxx (0xD800 up to 0xDBFF) start a
	 * two-unit sequence; everything else counts as a one-unit sequence.
	 */
	static constexpr size_t size(char16_t first_int) {
		return (first_int & 0xFC00) == 0xD800 ? 2 : 1;
	}

	/// Number of code units in use: two when the second stored unit is non-zero.
	size_t size() const { return ints[1] != 0 ? 2 : 1; }

	// Comparisons look at both stored units, in order.
	bool operator == (utf16_char rhs) const { return ints == rhs.ints; }
	bool operator != (utf16_char rhs) const { return !(ints == rhs.ints); }

	bool operator <  (utf16_char rhs) const { return ints < rhs.ints; }
	bool operator >  (utf16_char rhs) const { return rhs.ints < ints; }
	bool operator <= (utf16_char rhs) const { return !(rhs.ints < ints); }
	bool operator >= (utf16_char rhs) const { return !(ints < rhs.ints); }

	/// False only for the sentinel produced by the default constructor.
	explicit operator bool () const { return !(*this == utf16_char{}); }

	/// The character as a single UTF-32 value.
	char32_t char32() const { return code_point(); }

	/// The unicode code point this UTF-16 sequence decodes to.
	uint32_t code_point() const;

	friend std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c);

	/// A UTF-16 string holding exactly the code units of this sequence.
	explicit operator std::basic_string<char16_t>() const {
		std::basic_string<char16_t> result;
		result += *this;
		return result;
	}

};

## 1 utf.cpp
#include <algorithm>

#include "utf.hpp"

/// Encode a code point as a UTF-8 sequence of one to four bytes.
/**
 * Bytes beyond the end of the sequence are set to zero.
 *
 * \note Throws a std::domain_error for values of 0x110000 and above.
 */
utf8_char::utf8_char(char32_t code_point) {
	if (code_point >= 0x110000) {
		throw std::domain_error{"Not a valid unicode code point."};
	}

	// Continuation byte (10xxxxxx) holding the six bits found `shift` bits up.
	auto const trail = [code_point](unsigned shift) {
		return static_cast<char>(0x80 | (code_point >> shift & 0x3F));
	};

	if (code_point < 0x80) {
		bytes = {{static_cast<char>(code_point), 0, 0, 0}};
	} else if (code_point < 0x800) {
		bytes = {{static_cast<char>(0xC0 | (code_point >> 6)), trail(0), 0, 0}};
	} else if (code_point < 0x10000) {
		bytes = {{static_cast<char>(0xE0 | (code_point >> 12)), trail(6), trail(0), 0}};
	} else {
		bytes = {{static_cast<char>(0xF0 | (code_point >> 18)), trail(12), trail(6), trail(0)}};
	}
}

/// Decode the stored UTF-8 sequence into the code point it represents.
/**
 * The sequence length comes from size(), so its std::domain_error propagates
 * when the first byte cannot start a sequence. Continuation bytes are not
 * validated; only their low six bits are used.
 */
uint32_t utf8_char::code_point() const {
	size_t const length = size();
	if (length < 1 || length > 4) {
		throw std::logic_error{"Invalid utf8 multi-byte sequence."};
	}

	// Mask applied to the lead byte, indexed by length - 1. size() has already
	// checked the prefix bits of the lead byte, so any mask bit that overlaps
	// the prefix is known to select a zero.
	int const lead_mask[4] = {0x7F, 0x1F, 0x1F, 0x0F};

	uint32_t value = bytes[0] & lead_mask[length - 1];
	for (size_t i = 1; i < length; ++i) {
		// Each following byte contributes its six payload bits.
		value = (value << 6) | (bytes[i] & 0x3F);
	}
	return value;
}

/// Write the bytes of the UTF-8 sequence to the stream, one put() per byte.
/**
 * The number of bytes is c.size(), which throws a std::domain_error when the
 * first byte cannot start a sequence (as for a default-constructed utf8_char).
 */
std::ostream & operator << (std::ostream & out, utf8_char c) {
	char const * const first = c.data();
	char const * const last = first + c.size();
	for (char const * p = first; p != last; ++p) out.put(*p);
	return out;
}

/// Append the bytes of the UTF-8 sequence to the string and return the string.
/**
 * The number of bytes is c.size(), which throws a std::domain_error when the
 * first byte cannot start a sequence.
 */
std::string & operator += (std::string & out, utf8_char c) {
	return out.append(c.data(), c.size());
}

/// Encode a code point as a UTF-16 sequence of one or two code units.
/**
 * Values below 0x10000 are stored as a single unit with the second unit set
 * to zero. Larger values are stored as a surrogate pair.
 *
 * \note Throws a std::domain_error for values of 0x110000 and above, the same
 *       way the utf8_char constructor does. Such values do not fit in a
 *       surrogate pair and used to be encoded into unrelated code units.
 */
utf16_char::utf16_char(char32_t c) {
	if (c >= 0x110000) {
		throw std::domain_error{"Not a valid unicode code point."};
	}
	if (c < 0x10000) {
		ints[0] = static_cast<char16_t>(c);
		ints[1] = 0;
	} else {
		// 20 bits remain after removing the offset: ten for each surrogate.
		char32_t const offset = c - 0x10000;
		ints[0] = static_cast<char16_t>(0xD800 | (offset >> 10));
		ints[1] = static_cast<char16_t>(0xDC00 | (offset & 0x03FF));
	}
}

/// Decode the stored UTF-16 sequence into the code point it represents.
/**
 * A one-unit sequence is returned as is. For a two-unit sequence the low ten
 * bits of each unit are combined and offset by 0x10000; the units are not
 * checked for being proper surrogates.
 */
uint32_t utf16_char::code_point() const {
	if (size() == 1) {
		return ints[0];
	}
	uint32_t const high = ints[0] & 0x03FF;
	uint32_t const low = ints[1] & 0x03FF;
	return 0x10000 + ((high << 10) | low);
}

/// Append the code units of the UTF-16 sequence to the string and return the string.
std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c) {
	return out.append(c.data(), c.size());
}
	#include <array>
	#include <iostream>
	#include <stdexcept>
	#include <string>
	#include <cstdint>

	/// A single character stored as its UTF-8 byte sequence.
	/**
	 * The sequence lives in a fixed buffer of four chars. The length of the
	 * sequence is derived from the first byte, see size().
	 */
	class utf8_char {

	private:
		std::array<char, 4> bytes = {{}};

	public:
		/// Create the sentinel value: four 0xFF bytes, which is not valid UTF-8.
		/**
		 * No character encodes to this sequence, so it is usable as an 'EOF' marker.
		 */
		constexpr utf8_char() : bytes{{'\xFF', '\xFF', '\xFF', '\xFF'}} {}

		/// Encode the given code point as UTF-8.
		/**
		 * \note Throws a std::domain_error for values of 0x110000 and above.
		 */
		utf8_char(char32_t);

		/// Create from up to four raw bytes; bytes that are not given are zero.
		constexpr utf8_char(char a, char b = 0, char c = 0, char d = 0) : bytes{{a, b, c, d}} {}

		/// Create from the first four chars of the given array.
		constexpr explicit utf8_char(char a[4]) : bytes{{a[0], a[1], a[2], a[3]}} {}

		/// Pointer to the first of the four stored bytes.
		char const * data() const { return bytes.data(); }

		/// Length in bytes of a UTF-8 sequence that starts with the given byte.
		/**
		 * \note Throws a std::domain_error when the byte matches none of the four
		 *       lead-byte patterns below.
		 */
		static constexpr size_t size(char first_byte) {
			return
				(first_byte & 0x80) == 0x00 ? 1 :  // 0xxxxxxx
				(first_byte & 0xE0) == 0xC0 ? 2 :  // 110xxxxx
				(first_byte & 0xF0) == 0xE0 ? 3 :  // 1110xxxx
				(first_byte & 0xF8) == 0xF0 ? 4 :  // 11110xxx
				throw std::domain_error{"Invalid utf8 multi-byte sequence."};
		}

		/// Length in bytes of this sequence, as implied by its first byte.
		size_t size() const { return size(bytes[0]); }

		// Comparisons look at all four stored bytes, in order.
		bool operator == (utf8_char rhs) const { return bytes == rhs.bytes; }
		bool operator != (utf8_char rhs) const { return !(bytes == rhs.bytes); }

		bool operator <  (utf8_char rhs) const { return bytes < rhs.bytes; }
		bool operator >  (utf8_char rhs) const { return rhs.bytes < bytes; }
		bool operator <= (utf8_char rhs) const { return !(rhs.bytes < bytes); }
		bool operator >= (utf8_char rhs) const { return !(bytes < rhs.bytes); }

		/// False only for the sentinel produced by the default constructor.
		explicit operator bool () const { return !(*this == utf8_char{}); }

		/// The character as a single UTF-32 value.
		char32_t char32() const { return code_point(); }

		/// The unicode code point this UTF-8 sequence decodes to.
		uint32_t code_point() const;

		friend std::ostream & operator << (std::ostream &, utf8_char);
		friend std::string & operator += (std::string & out, utf8_char c);

		/// A std::string holding exactly the bytes of this sequence.
		explicit operator std::string() const {
			std::string result;
			result += *this;
			return result;
		}

	};

	/// A single character stored as its UTF-16 sequence of one or two code units.
	/**
	 * Both units are always stored; a second unit of zero means the sequence
	 * consists of the first unit only, see size().
	 */
	class utf16_char {

	private:
		std::array<char16_t, 2> ints = {{}};

	public:
		/// Create the sentinel value: two 0xFFFF units.
		/**
		 * It doesn't represent any character, so it is usable as an 'EOF' marker.
		 */
		constexpr utf16_char() : ints{{u'\xFFFF', u'\xFFFF'}} {}

		/// Encode the given code point as one or two UTF-16 code units.
		utf16_char(char32_t);

		/// Re-encode a UTF-8 character by going through its code point.
		utf16_char(utf8_char c) : utf16_char(c.char32()) {}

		/// Create from raw code units; the second unit is zero when not given.
		constexpr utf16_char(char16_t a, char16_t b = 0) : ints{{a, b}} {}

		/// Create from the first two units of the given array.
		constexpr explicit utf16_char(char16_t a[2]) : ints{{a[0], a[1]}} {}

		/// Pointer to the first of the two stored code units.
		char16_t const * data() const { return ints.data(); }

		/// Number of code units in a UTF-16 sequence starting with the given unit.
		/**
		 * Units of the form 110110xx xxxxxxxx (0xD800 up to 0xDBFF) start a
		 * two-unit sequence; everything else counts as a one-unit sequence.
		 */
		static constexpr size_t size(char16_t first_int) {
			return (first_int & 0xFC00) == 0xD800 ? 2 : 1;
		}

		/// Number of code units in use: two when the second stored unit is non-zero.
		size_t size() const { return ints[1] != 0 ? 2 : 1; }

		// Comparisons look at both stored units, in order.
		bool operator == (utf16_char rhs) const { return ints == rhs.ints; }
		bool operator != (utf16_char rhs) const { return !(ints == rhs.ints); }

		bool operator <  (utf16_char rhs) const { return ints < rhs.ints; }
		bool operator >  (utf16_char rhs) const { return rhs.ints < ints; }
		bool operator <= (utf16_char rhs) const { return !(rhs.ints < ints); }
		bool operator >= (utf16_char rhs) const { return !(ints < rhs.ints); }

		/// False only for the sentinel produced by the default constructor.
		explicit operator bool () const { return !(*this == utf16_char{}); }

		/// The character as a single UTF-32 value.
		char32_t char32() const { return code_point(); }

		/// The unicode code point this UTF-16 sequence decodes to.
		uint32_t code_point() const;

		friend std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c);

		/// A UTF-16 string holding exactly the code units of this sequence.
		explicit operator std::basic_string<char16_t>() const {
			std::basic_string<char16_t> result;
			result += *this;
			return result;
		}

	};
	#include <algorithm>

	#include "utf.hpp"

	/// Encode a code point as a UTF-8 sequence of one to four bytes.
	/**
	 * Bytes beyond the end of the sequence are set to zero.
	 *
	 * \note Throws a std::domain_error for values of 0x110000 and above.
	 */
	utf8_char::utf8_char(char32_t code_point) {
		if (code_point >= 0x110000) {
			throw std::domain_error{"Not a valid unicode code point."};
		}

		// Continuation byte (10xxxxxx) holding the six bits found `shift` bits up.
		auto const trail = [code_point](unsigned shift) {
			return static_cast<char>(0x80 | (code_point >> shift & 0x3F));
		};

		if (code_point < 0x80) {
			bytes = {{static_cast<char>(code_point), 0, 0, 0}};
		} else if (code_point < 0x800) {
			bytes = {{static_cast<char>(0xC0 | (code_point >> 6)), trail(0), 0, 0}};
		} else if (code_point < 0x10000) {
			bytes = {{static_cast<char>(0xE0 | (code_point >> 12)), trail(6), trail(0), 0}};
		} else {
			bytes = {{static_cast<char>(0xF0 | (code_point >> 18)), trail(12), trail(6), trail(0)}};
		}
	}

	/// Decode the stored UTF-8 sequence into the code point it represents.
	/**
	 * The sequence length comes from size(), so its std::domain_error propagates
	 * when the first byte cannot start a sequence. Continuation bytes are not
	 * validated; only their low six bits are used.
	 */
	uint32_t utf8_char::code_point() const {
		size_t const length = size();
		if (length < 1 || length > 4) {
			throw std::logic_error{"Invalid utf8 multi-byte sequence."};
		}

		// Mask applied to the lead byte, indexed by length - 1. size() has already
		// checked the prefix bits of the lead byte, so any mask bit that overlaps
		// the prefix is known to select a zero.
		int const lead_mask[4] = {0x7F, 0x1F, 0x1F, 0x0F};

		uint32_t value = bytes[0] & lead_mask[length - 1];
		for (size_t i = 1; i < length; ++i) {
			// Each following byte contributes its six payload bits.
			value = (value << 6) | (bytes[i] & 0x3F);
		}
		return value;
	}

	/// Write the bytes of the UTF-8 sequence to the stream, one put() per byte.
	/**
	 * The number of bytes is c.size(), which throws a std::domain_error when the
	 * first byte cannot start a sequence (as for a default-constructed utf8_char).
	 */
	std::ostream & operator << (std::ostream & out, utf8_char c) {
		char const * const first = c.data();
		char const * const last = first + c.size();
		for (char const * p = first; p != last; ++p) out.put(*p);
		return out;
	}

	/// Append the bytes of the UTF-8 sequence to the string and return the string.
	/**
	 * The number of bytes is c.size(), which throws a std::domain_error when the
	 * first byte cannot start a sequence.
	 */
	std::string & operator += (std::string & out, utf8_char c) {
		return out.append(c.data(), c.size());
	}

	/// Encode a code point as a UTF-16 sequence of one or two code units.
	/**
	 * Values below 0x10000 are stored as a single unit with the second unit set
	 * to zero. Larger values are stored as a surrogate pair.
	 *
	 * \note Throws a std::domain_error for values of 0x110000 and above, the same
	 *       way the utf8_char constructor does. Such values do not fit in a
	 *       surrogate pair and used to be encoded into unrelated code units.
	 */
	utf16_char::utf16_char(char32_t c) {
		if (c >= 0x110000) {
			throw std::domain_error{"Not a valid unicode code point."};
		}
		if (c < 0x10000) {
			ints[0] = static_cast<char16_t>(c);
			ints[1] = 0;
		} else {
			// 20 bits remain after removing the offset: ten for each surrogate.
			char32_t const offset = c - 0x10000;
			ints[0] = static_cast<char16_t>(0xD800 | (offset >> 10));
			ints[1] = static_cast<char16_t>(0xDC00 | (offset & 0x03FF));
		}
	}

	/// Decode the stored UTF-16 sequence into the code point it represents.
	/**
	 * A one-unit sequence is returned as is. For a two-unit sequence the low ten
	 * bits of each unit are combined and offset by 0x10000; the units are not
	 * checked for being proper surrogates.
	 */
	uint32_t utf16_char::code_point() const {
		if (size() == 1) {
			return ints[0];
		}
		uint32_t const high = ints[0] & 0x03FF;
		uint32_t const low = ints[1] & 0x03FF;
		return 0x10000 + ((high << 10) | low);
	}

	/// Append the code units of the UTF-16 sequence to the string and return the string.
	std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c) {
		return out.append(c.data(), c.size());
	}