Skip to content

Instantly share code, notes, and snippets.

@m-ou-se
Last active April 27, 2023 22:24
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m-ou-se/9897607 to your computer and use it in GitHub Desktop.
Save m-ou-se/9897607 to your computer and use it in GitHub Desktop.
Utf-8/16 characters in C++
#include <array>
#include <iostream>
#include <stdexcept>
#include <string>
#include <cstdint>
/// A UTF-8 encoded character.
/**
 * Holds four bytes; size() derives from the first byte how many of them
 * belong to the encoded sequence.
 */
class utf8_char {
private:
    std::array<char, 4> bytes = {{}};

    /// Three-way comparison of all four bytes, each treated as unsigned.
    /**
     * Plain `char` may be signed, in which case comparing the raw array would
     * place every byte >= 0x80 (all multi-byte sequences and the 0xFF 'EOF'
     * marker) before ASCII. Comparing as `unsigned char` makes the ordering
     * independent of the platform's `char` signedness.
     *
     * \return -1, 0 or 1 if this object orders before, equal to, or after \p o.
     */
    int compare(utf8_char const & o) const {
        for (size_t i = 0; i < bytes.size(); ++i) {
            unsigned char const lhs = static_cast<unsigned char>(bytes[i]);
            unsigned char const rhs = static_cast<unsigned char>(o.bytes[i]);
            if (lhs != rhs) return lhs < rhs ? -1 : 1;
        }
        return 0;
    }

public:
    /// Construct an invalid UTF-8 sequence.
    /**
     * This can be used as 'EOF' marker, since it doesn't represent any character.
     * All four bytes are 0xFF, so size() throws for such an object; use the
     * explicit conversion to bool to test for it.
     */
    constexpr utf8_char() : bytes{{'\xFF', '\xFF', '\xFF', '\xFF'}} {}

    /// Construct from a code point (defined out of line).
    utf8_char(char32_t);

    /// Construct from up to four already-encoded bytes; omitted bytes are zero.
    constexpr utf8_char(char a, char b = 0, char c = 0, char d = 0) : bytes{{a,b,c,d}} {}

    /// Construct by copying exactly four bytes from \p a.
    constexpr explicit utf8_char(char a[4]) : bytes{{a[0],a[1],a[2],a[3]}} {}

    /// Pointer to the four stored bytes; only the first size() are meaningful.
    char const * data() const { return bytes.data(); }

    /// Length in bytes of the stored sequence, derived from its first byte.
    /**
     * \note Throws std::domain_error if the first byte can't start a UTF-8
     * sequence (which includes default-constructed objects).
     */
    size_t size() const { return size(bytes[0]); }

    /// Given the first byte of a UTF-8 sequence, determine the length of the entire sequence.
    /**
     * \note This function will throw a std::domain_error if the given first byte
     * can't be a valid start of a UTF-8 sequence.
     */
    static constexpr size_t size(char first_byte) {
        return (first_byte & 0x80) == 0x00 ? 1 :
               (first_byte & 0xE0) == 0xC0 ? 2 :
               (first_byte & 0xF0) == 0xE0 ? 3 :
               (first_byte & 0xF8) == 0xF0 ? 4 :
               throw std::domain_error{"Invalid utf8 multi-byte sequence."};
    }

    bool operator == (utf8_char o) const { return bytes == o.bytes; }
    bool operator != (utf8_char o) const { return bytes != o.bytes; }

    // Ordering goes through compare() so that it is based on unsigned byte values.
    bool operator <  (utf8_char o) const { return compare(o) <  0; }
    bool operator >  (utf8_char o) const { return compare(o) >  0; }
    bool operator <= (utf8_char o) const { return compare(o) <= 0; }
    bool operator >= (utf8_char o) const { return compare(o) >= 0; }

    /// False only for an object equal to the default-constructed 'EOF' marker.
    explicit operator bool () const { return *this != utf8_char{}; }

    /// Get a UTF-32 version of the character.
    char32_t char32() const { return code_point(); }

    /// The unicode code point encoded by this UTF-8 sequence.
    uint32_t code_point() const;

    friend std::ostream & operator << (std::ostream &, utf8_char);
    friend std::string & operator += (std::string & out, utf8_char c);

    /// A string holding the bytes that operator+= appends for this character.
    explicit operator std::string() const {
        std::string s;
        s += *this;
        return s;
    }
};
/// A UTF-16 encoded character.
/**
 * Holds two 16-bit code units. A zero second unit marks a character that
 * occupies a single unit (see size()).
 */
class utf16_char {
private:
    std::array<char16_t, 2> ints = {{}};

public:
    /// Construct an invalid UTF-16 sequence.
    /**
     * This can be used as 'EOF' marker, since it doesn't represent any character.
     * Both code units are set to 0xFFFF.
     */
    constexpr utf16_char() : ints{{u'\xFFFF', u'\xFFFF'}} {}

    /// Construct from a code point (defined out of line).
    utf16_char(char32_t);

    /// Construct from a UTF-8 character by way of its UTF-32 value.
    utf16_char(utf8_char c) : utf16_char(c.char32()) {}

    /// Construct from one or two already-encoded code units.
    constexpr utf16_char(char16_t a, char16_t b = 0) : ints{{a,b}} {}

    /// Construct by copying exactly two code units from \p a.
    constexpr explicit utf16_char(char16_t a[2]) : ints{{a[0],a[1]}} {}

    /// Pointer to the two stored code units; only the first size() are meaningful.
    char16_t const * data() const { return ints.data(); }

    /// Number of code units in use: two when the second unit is non-zero, else one.
    size_t size() const { return ints[1] != 0 ? 2 : 1; }

    /// Given the first integer of a UTF-16 sequence, determine the length of the entire sequence.
    static constexpr size_t size(char16_t first_int) {
        // Units in 0xD800..0xDBFF announce a second unit; all others stand alone.
        return (first_int & 0xFC00) != 0xD800 ? 1 : 2;
    }

    bool operator == (utf16_char o) const { return ints == o.ints; }
    bool operator != (utf16_char o) const { return !(ints == o.ints); }
    bool operator <  (utf16_char o) const { return ints < o.ints; }
    bool operator >  (utf16_char o) const { return o.ints < ints; }
    bool operator <= (utf16_char o) const { return !(o.ints < ints); }
    bool operator >= (utf16_char o) const { return !(ints < o.ints); }

    /// False only for an object equal to the default-constructed 'EOF' marker.
    explicit operator bool () const { return !(*this == utf16_char{}); }

    /// Get a UTF-32 version of the character.
    char32_t char32() const { return static_cast<char32_t>(code_point()); }

    /// The unicode code point encoded by this UTF-16 sequence.
    uint32_t code_point() const;

    friend std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c);

    /// A UTF-16 string holding the code units that operator+= appends for this character.
    explicit operator std::basic_string<char16_t>() const {
        std::basic_string<char16_t> result;
        result += *this;
        return result;
    }
};
#include <algorithm>
#include "utf.hpp"
/// Encode a Unicode code point as a UTF-8 sequence.
/**
 * Bytes beyond the encoded length are set to zero.
 *
 * \note Throws std::domain_error if \p code_point is a surrogate
 * (U+D800..U+DFFF) or is greater than U+10FFFF; neither has a well-formed
 * UTF-8 encoding.
 */
utf8_char::utf8_char(char32_t code_point) {
    // Surrogates are reserved for UTF-16; encoding them would yield ill-formed UTF-8.
    if (code_point >= 0xD800 && code_point < 0xE000) {
        throw std::domain_error{"Not a valid unicode code point."};
    }
    if (code_point < 0x80) {
        // 0xxxxxxx
        bytes[0] = static_cast<char>(code_point);
        bytes[1] = 0;
        bytes[2] = 0;
        bytes[3] = 0;
    } else if (code_point < 0x800) {
        // 110xxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0xC0 | (code_point >> 6));
        bytes[1] = static_cast<char>(0x80 | (code_point & 0x3F));
        bytes[2] = 0;
        bytes[3] = 0;
    } else if (code_point < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0xE0 | (code_point >> 12));
        bytes[1] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        bytes[2] = static_cast<char>(0x80 | (code_point & 0x3F));
        bytes[3] = 0;
    } else if (code_point < 0x110000) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0xF0 | (code_point >> 18));
        bytes[1] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        bytes[2] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        bytes[3] = static_cast<char>(0x80 | (code_point & 0x3F));
    } else {
        throw std::domain_error{"Not a valid unicode code point."};
    }
}
uint32_t utf8_char::code_point() const {
size_t n = size();
if (n == 1) return bytes[0];
if (n == 2) {
return ((bytes[0] & 0x1F) << 6) |
( bytes[1] & 0x3F );
}
if (n == 3) {
return ((bytes[0] & 0x1F) << 12) |
((bytes[1] & 0x3F) << 6) |
( bytes[2] & 0x3F );
}
if (n == 4) {
return ((bytes[0] & 0x0F) << 18) |
((bytes[1] & 0x3F) << 12) |
((bytes[2] & 0x3F) << 6) |
( bytes[3] & 0x3F );
}
throw std::logic_error{"Invalid utf8 multi-byte sequence."};
}
/// Write the bytes of \p c to \p out, one put() per byte, and return \p out.
std::ostream & operator << (std::ostream & out, utf8_char c) {
    char const * const first = c.data();
    char const * const last = first + c.size();
    for (char const * byte = first; byte != last; ++byte) out.put(*byte);
    return out;
}
/// Append the bytes of \p c to \p out and return \p out.
std::string & operator += (std::string & out, utf8_char c) {
    out.append(c.data(), c.size());
    return out;
}
/// Encode a Unicode code point as one UTF-16 code unit or a surrogate pair.
/**
 * Values below 0x10000 are stored verbatim in the first unit, with the second
 * unit set to zero. Larger values are split into a high (0xD800-based) and a
 * low (0xDC00-based) surrogate.
 *
 * \note Throws std::domain_error if \p c is greater than U+10FFFF: such a value
 * does not fit the 10-bit high-surrogate field and would otherwise silently
 * produce a corrupted pair.
 */
utf16_char::utf16_char(char32_t c) {
    if (c >= 0x110000) {
        throw std::domain_error{"Not a valid unicode code point."};
    }
    if (c < 0x10000) {
        ints[0] = static_cast<char16_t>(c);
        ints[1] = 0;
    } else {
        // 20 bits remain after removing the 0x10000 offset: 10 per surrogate.
        char32_t const offset = c - 0x10000;
        ints[0] = static_cast<char16_t>(0xD800 | (offset >> 10));
        ints[1] = static_cast<char16_t>(0xDC00 | (offset & 0x03FF));
    }
}
/// Decode the stored UTF-16 sequence into its code point.
/**
 * A single-unit character is returned as is. Otherwise the low ten bits of
 * each unit are combined and offset by 0x10000.
 */
uint32_t utf16_char::code_point() const {
    if (size() != 1) {
        uint32_t const high = ints[0] & 0x03FF;
        uint32_t const low = ints[1] & 0x03FF;
        return 0x10000 + ((high << 10) | low);
    }
    return ints[0];
}
/// Append the code units of \p c to \p out and return \p out.
std::basic_string<char16_t> & operator += (std::basic_string<char16_t> & out, utf16_char c) {
    out.append(c.data(), c.size());
    return out;
}
@amsitlab
Copy link

amsitlab commented Jan 1, 2019

Great, readable code. By the way, how do I use this library? Can you give me a usage example?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment