wcho21/.gitignore

## .gitignore
*.out

## README.md

      
Implementing UTF encodings

To compile, run `make`.

  
## makefile
# Default target: build all three test binaries.
.PHONY: all
all: utf-8.out utf-16.out utf-32.out

# Compile each source file into a same-named .out binary.
# -std=c++2a is required because utf-8.cpp uses char8_t.
%.out: %.cpp
	g++ -std=c++2a -Wall -Wextra -o $@ $<

# Run every test binary. Depends on `all` so that `make run` on a fresh
# checkout builds the binaries first instead of failing on missing files.
.PHONY: run
run: all
	./utf-8.out
	./utf-16.out
	./utf-32.out

## utf-16.cpp
#include <algorithm>
#include <cstdint>
#include <cassert>

#ifdef _WIN64
#include <windows.h> // for Windows
#endif

using std::begin, std::end, std::equal;

typedef uint32_t Codepoint;

/**
 * Encodes a single Unicode code point as a null-terminated UTF-16 sequence.
 *
 * @param cp   code point to encode
 * @param buf  output buffer; needs room for 3 code units (a surrogate pair
 *             plus the terminating zero)
 *
 * Values that are not Unicode scalar values -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- are replaced by U+FFFD.
 */
void encodeUtf16(const Codepoint cp, char16_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  if (is_surrogate || out_of_range) {
    // return U+FFFD in UTF-16
    buf[0] = 0xFFFD;
    buf[1] = u'\0';

    return;
  }

  const bool within_16bits = cp < (1u << 16);
  if (within_16bits) {
    // BMP code points are stored as a single code unit
    buf[0] = static_cast<char16_t>(cp);
    buf[1] = u'\0';

    return;
  }

  // encode x xxxx yyyy yyzz zzzz zzzz
  // into bytes 1101 10ww wwyy yyyy 1101 11zz zzzz zzzz
  // with wwww = xxxxx - 1
  //
  // cp <= 0x10FFFF here, so x is in 1..16 and w fits in four bits.
  const Codepoint x = cp >> 16;
  const Codepoint y = (cp >> 10) & 0b11'1111;
  const Codepoint z = cp & 0b11'1111'1111;
  const Codepoint w = x - 1;

  buf[0] = static_cast<char16_t>(0b1101'1000'0000'0000 | (w << 6) | y);
  buf[1] = static_cast<char16_t>(0b1101'1100'0000'0000 | z);
  buf[2] = u'\0';
}

/**
 * Decodes the first code point of a UTF-16 sequence.
 *
 * @param buf  UTF-16 code units; buf[1] is read only when buf[0] is a high
 *             surrogate
 * @return buf[0] itself when it is not a surrogate, the combined code point
 *         when buf[0] and buf[1] form a high/low surrogate pair, and U+FFFD
 *         otherwise (lone low surrogate, or a high surrogate that is not
 *         followed by a low surrogate)
 */
Codepoint decodeUtf16(const char16_t *buf) {
  // The top six bits of a code unit identify surrogates:
  // 1101 10.. is a high (leading) one, 1101 11.. a low (trailing) one.
  constexpr Codepoint kTagMask = 0xFC00;
  constexpr Codepoint kHighTag = 0xD800;
  constexpr Codepoint kLowTag = 0xDC00;
  constexpr Codepoint kReplacement = 0xFFFD;

  const Codepoint lead = buf[0];
  const Codepoint lead_tag = lead & kTagMask;

  if (lead_tag != kHighTag) {
    // Either an ordinary single-unit code point or a stray low surrogate.
    return lead_tag == kLowTag ? kReplacement : lead;
  }

  const Codepoint trail = buf[1];
  if ((trail & kTagMask) != kLowTag) {
    return kReplacement;
  }

  // lead   = 1101 10xx xxyy yyyy
  // trail  = 1101 11zz zzzz zzzz
  // result = w wwww yyyy yyzz zzzz zzzz, with wwwww = xxxx + 1
  const Codepoint plane = ((lead >> 6) & 0b1111) + 1;
  const Codepoint upper = lead & 0b11'1111;
  const Codepoint lower = trail & 0b11'1111'1111;

  return (plane << 16) | (upper << 10) | lower;
}

// test code
// Self-test: encodes a few sample code points, checks the produced code
// units against the compiler's own UTF-16 literals, and round-trips each
// sample through the decoder.
int main() {
#ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8); // for Windows
#endif

  struct Sample {
    Codepoint cp;
    const char16_t *expected;
    int units;  // number of code units compared (terminator not included)
  };

  const Sample samples[] = {
    {0x41,    u"A",  1},
    {0xA9,    u"©",  1},
    {0xAC00,  u"가", 1},
    {0x1F602, u"😂", 2},
  };

  char16_t utf16_buffer[3];

  for (const Sample &sample : samples) {
    encodeUtf16(sample.cp, utf16_buffer);
    assert(std::equal(utf16_buffer, utf16_buffer + sample.units, sample.expected));
    assert(decodeUtf16(utf16_buffer) == sample.cp);
  }
}

## utf-32.cpp
#include <algorithm>
#include <cstdint>
#include <cassert>

#ifdef _WIN64
#include <windows.h> // for Windows
#endif

using std::begin, std::end, std::equal;

typedef uint32_t Codepoint;

/**
 * Encodes a single Unicode code point as a null-terminated UTF-32 sequence.
 *
 * @param cp   code point to encode
 * @param buf  output buffer; needs room for 2 code units (the code point
 *             plus the terminating zero)
 *
 * Values that are not Unicode scalar values -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- are replaced by U+FFFD.
 */
void encodeUtf32(const Codepoint cp, char32_t *buf) {
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;

  // A valid scalar value is its own UTF-32 code unit.
  buf[0] = (is_surrogate || out_of_range) ? static_cast<char32_t>(0xFFFD)
                                          : static_cast<char32_t>(cp);
  buf[1] = U'\0';
}

/**
 * Decodes the first code point of a UTF-32 sequence.
 *
 * @param buf  UTF-32 code units; only buf[0] is read
 * @return buf[0] when it is a Unicode scalar value, otherwise U+FFFD
 *         (surrogates U+D800..U+DFFF and values above U+10FFFF are not
 *         valid UTF-32 code units)
 */
Codepoint decodeUtf32(const char32_t *buf) {
  const Codepoint cp = static_cast<Codepoint>(buf[0]);

  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  const bool out_of_range = cp > 0x10FFFF;
  if (is_surrogate || out_of_range) {
    return 0xFFFD;
  }

  return cp;
}

// test code
// Self-test: encodes a few sample code points, checks the result against the
// compiler's own UTF-32 (U"...") literals, and round-trips each sample
// through the decoder.
int main() {
  #ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8); // for Windows
  #endif

  char32_t utf32_buffer[2];

  encodeUtf32(0x41, utf32_buffer);
  // U"A" (UTF-32 literal), matching the element type of the buffer and the
  // other samples below.
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"A"));
  assert(decodeUtf32(utf32_buffer) == 0x41);

  encodeUtf32(0xA9, utf32_buffer);
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"©"));
  assert(decodeUtf32(utf32_buffer) == 0xA9);

  encodeUtf32(0xAC00, utf32_buffer);
  assert(std::equal(utf32_buffer, utf32_buffer+1, U"가"));
  assert(decodeUtf32(utf32_buffer) == 0xAC00);

  // This comparison spans two units, so it also checks the terminator.
  encodeUtf32(0x1F602, utf32_buffer);
  assert(std::equal(utf32_buffer, utf32_buffer+2, U"😂"));
  assert(decodeUtf32(utf32_buffer) == 0x1F602);
}

## utf-8.cpp
#include <algorithm>
#include <cstdint>
#include <cassert>

#ifdef _WIN64
#include <windows.h> // for Windows
#endif

using std::begin, std::end, std::equal;

typedef uint32_t Codepoint;

void encodeUtf8(const Codepoint cp, char8_t *buf) {
  bool within_7bits = cp < (1 << 7);
  if (within_7bits) {
    // encode xxx xxxx
    // into bytes 0xxx xxxx

    buf[0] = cp;
    buf[1] = '\0';

    return;
  }

  bool within_11bits = cp < (1 << 11);
  if (within_11bits) {
    // encode yyy yyxx xxxx
    // into bytes 110y yyyy 10xx xxxx

    Codepoint y = (cp & 0b111'1100'0000) >> 6;
    Codepoint x = (cp & 0b000'0011'1111);

    buf[0] = 0b1100'0000 | y;
    buf[1] = 0b1000'0000 | x;
    buf[2] = '\0';

    return;
  }

  bool within_16bits = cp < (1 << 16);
  if (within_16bits) {
    // encode zzzz yyyy yyxx xxxx
    // into bytes 1110 zzzz 10yy yyyy 10xx xxxx

    Codepoint z = (cp & 0b1111'0000'0000'0000) >> 12;
    Codepoint y = (cp & 0b0000'1111'1100'0000) >>  6;
    Codepoint x = (cp & 0b0000'0000'0011'1111);

    buf[0] = 0b1110'0000 | z;
    buf[1] = 0b1000'0000 | y;
    buf[2] = 0b1000'0000 | x;
    buf[3] = '\0';

    return;
  }

  bool within_21bits = cp < (1 << 21);
  if (within_21bits) {
    // encode w wwzz zzzz yyyy yyxx xxxx
    // into bytes 1111 0www 10zz zzzz 10yy yyyy 10xx xxxx

    Codepoint w = (cp & 0b1'1100'0000'0000'0000'0000) >> 18;
    Codepoint z = (cp & 0b0'0011'1111'0000'0000'0000) >> 12;
    Codepoint y = (cp & 0b0'0000'0000'1111'1100'0000) >>  6;
    Codepoint x = (cp & 0b0'0000'0000'0000'0011'1111);

    buf[0] = 0b1111'0000 | w;
    buf[1] = 0b1000'0000 | z;
    buf[2] = 0b1000'0000 | y;
    buf[3] = 0b1000'0000 | x;
    buf[4] = '\0';

    return;
  }

  // return U+FFFD in UTF-8
  buf[0] = 0xEF;
  buf[1] = 0xBF;
  buf[2] = 0xBD;
  buf[3] = '\0';
}

Codepoint decodeUtf8(const char8_t *buf) {
  if ((buf[0] & 0b1000'0000) == 0) {
    // get xxx xxxx
    // from bytes 0xxx xxxx

    return (Codepoint) buf[0];
  }

  if (((buf[0] & 0b1110'0000) >> 5) == 0b110 &&
      ((buf[1] & 0b1100'0000) >> 6) == 0b10) {
    // get xxx xxyy yyyy
    // from bytes 110x xxxx 10yy yyyy

    Codepoint x = buf[0] &  0b1'1111;
    Codepoint y = buf[1] & 0b11'1111;

    Codepoint cp = (x << 6) | y;
    return cp;
  }

  if (((buf[0] & 0b1111'0000) >> 4) == 0b1110 &&
      ((buf[1] & 0b1100'0000) >> 6) == 0b10 &&
      ((buf[2] & 0b1100'0000) >> 6) == 0b10) {
    // get xxxx yyyy yyzz zzzz
    // from bytes 1110 xxxx 10yy yyyy 10zz zzzz

    Codepoint x = buf[0] &    0b1111;
    Codepoint y = buf[1] & 0b11'1111;
    Codepoint z = buf[2] & 0b11'1111;

    Codepoint cp = (x << 12) | (y << 6) | z;
    return cp;
  }

  if (((buf[0] & 0b1111'1000) >> 3) == 0b11110 &&
      ((buf[1] & 0b1100'0000) >> 6) == 0b10 &&
      ((buf[2] & 0b1100'0000) >> 6) == 0b10 &&
      ((buf[3] & 0b1100'0000) >> 6) == 0b10) {
    // get x xxyy yyyy zzzz zzww wwww
    // from bytes 1111 0xxx 10yy yyyy 10zz zzzz 10ww wwww

    Codepoint x = buf[0] &     0b111;
    Codepoint y = buf[1] & 0b11'1111;
    Codepoint z = buf[2] & 0b11'1111;
    Codepoint w = buf[3] & 0b11'1111;

    Codepoint cp = (x << 18) | (y << 12) | (z << 6) | w;
    return cp;
  }

  return 0xFFFD;
}

// test code
int main() {
  #ifdef _WIN64
  SetConsoleOutputCP(CP_UTF8); // for Windows
  #endif

  char8_t utf8_buffer[5];

  encodeUtf8(0x41, utf8_buffer);
  assert(equal(utf8_buffer, utf8_buffer+2, u8"A"));
  assert(decodeUtf8(utf8_buffer) == 0x41);

  encodeUtf8(0xA9, utf8_buffer);
  assert(equal(utf8_buffer, utf8_buffer+3, u8"©"));
  assert(decodeUtf8(utf8_buffer) == 0xA9);

  encodeUtf8(0xAC00, utf8_buffer);
  assert(equal(utf8_buffer, utf8_buffer+4, u8"가"));
  assert(decodeUtf8(utf8_buffer) == 0xAC00);

  encodeUtf8(0x1F602, utf8_buffer);
  assert(equal(utf8_buffer, utf8_buffer+5, u8"😂"));
  assert(decodeUtf8(utf8_buffer) == 0x1F602);
}
	# NOTE(review): every line of this copy carries a leading tab, which looks
	# like an extraction artifact -- confirm before using it as a makefile.
	# Default target: the three test binaries.
	.PHONY: all
	all: utf-8.out utf-16.out utf-32.out

	# Pattern rule: compile each .cpp into a same-named .out with g++ in
	# C++2a mode and -Wall -Wextra.
	%.out: %.cpp
	g++ -std=c++2a -Wall -Wextra -o $@ $<

	# Runs the three binaries in order; has no prerequisites, so it does not
	# build them first.
	.PHONY: run
	run:
	./utf-8.out
	./utf-16.out
	./utf-32.out
	#include <algorithm>
	#include <cstdint>
	#include <cassert>

	#ifdef _WIN64
	#include <windows.h> // for Windows
	#endif

	using std::begin, std::end, std::equal;

	typedef uint32_t Codepoint;

	/**
	 * Encodes a single Unicode code point as a null-terminated UTF-16 sequence.
	 *
	 * @param cp   code point to encode
	 * @param buf  output buffer; needs room for 3 code units (a surrogate pair
	 *             plus the terminating zero)
	 *
	 * Values that are not Unicode scalar values -- the surrogate range
	 * U+D800..U+DFFF and anything above U+10FFFF -- are replaced by U+FFFD.
	 */
	void encodeUtf16(const Codepoint cp, char16_t *buf) {
	  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
	  const bool out_of_range = cp > 0x10FFFF;
	  if (is_surrogate || out_of_range) {
	    // return U+FFFD in UTF-16
	    buf[0] = 0xFFFD;
	    buf[1] = u'\0';

	    return;
	  }

	  const bool within_16bits = cp < (1u << 16);
	  if (within_16bits) {
	    // BMP code points are stored as a single code unit
	    buf[0] = static_cast<char16_t>(cp);
	    buf[1] = u'\0';

	    return;
	  }

	  // encode x xxxx yyyy yyzz zzzz zzzz
	  // into bytes 1101 10ww wwyy yyyy 1101 11zz zzzz zzzz
	  // with wwww = xxxxx - 1
	  //
	  // cp <= 0x10FFFF here, so x is in 1..16 and w fits in four bits.
	  const Codepoint x = cp >> 16;
	  const Codepoint y = (cp >> 10) & 0b11'1111;
	  const Codepoint z = cp & 0b11'1111'1111;
	  const Codepoint w = x - 1;

	  buf[0] = static_cast<char16_t>(0b1101'1000'0000'0000 | (w << 6) | y);
	  buf[1] = static_cast<char16_t>(0b1101'1100'0000'0000 | z);
	  buf[2] = u'\0';
	}

	/**
	 * Decodes the first code point of a UTF-16 sequence.
	 *
	 * @param buf  UTF-16 code units; buf[1] is read only when buf[0] is a high
	 *             surrogate
	 * @return buf[0] itself when it is not a surrogate, the combined code point
	 *         when buf[0] and buf[1] form a high/low surrogate pair, and U+FFFD
	 *         otherwise (lone low surrogate, or a high surrogate that is not
	 *         followed by a low surrogate)
	 */
	Codepoint decodeUtf16(const char16_t *buf) {
	  // The top six bits of a code unit identify surrogates:
	  // 1101 10.. is a high (leading) one, 1101 11.. a low (trailing) one.
	  constexpr Codepoint kTagMask = 0xFC00;
	  constexpr Codepoint kHighTag = 0xD800;
	  constexpr Codepoint kLowTag = 0xDC00;
	  constexpr Codepoint kReplacement = 0xFFFD;

	  const Codepoint lead = buf[0];
	  const Codepoint lead_tag = lead & kTagMask;

	  if (lead_tag != kHighTag) {
	    // Either an ordinary single-unit code point or a stray low surrogate.
	    return lead_tag == kLowTag ? kReplacement : lead;
	  }

	  const Codepoint trail = buf[1];
	  if ((trail & kTagMask) != kLowTag) {
	    return kReplacement;
	  }

	  // lead   = 1101 10xx xxyy yyyy
	  // trail  = 1101 11zz zzzz zzzz
	  // result = w wwww yyyy yyzz zzzz zzzz, with wwwww = xxxx + 1
	  const Codepoint plane = ((lead >> 6) & 0b1111) + 1;
	  const Codepoint upper = lead & 0b11'1111;
	  const Codepoint lower = trail & 0b11'1111'1111;

	  return (plane << 16) | (upper << 10) | lower;
	}

	// test code
	// Self-test: encodes a few sample code points, checks the produced code
	// units against the compiler's own UTF-16 literals, and round-trips each
	// sample through the decoder.
	int main() {
	#ifdef _WIN64
	  SetConsoleOutputCP(CP_UTF8); // for Windows
	#endif

	  struct Sample {
	    Codepoint cp;
	    const char16_t *expected;
	    int units;  // number of code units compared (terminator not included)
	  };

	  const Sample samples[] = {
	    {0x41,    u"A",  1},
	    {0xA9,    u"©",  1},
	    {0xAC00,  u"가", 1},
	    {0x1F602, u"😂", 2},
	  };

	  char16_t utf16_buffer[3];

	  for (const Sample &sample : samples) {
	    encodeUtf16(sample.cp, utf16_buffer);
	    assert(std::equal(utf16_buffer, utf16_buffer + sample.units, sample.expected));
	    assert(decodeUtf16(utf16_buffer) == sample.cp);
	  }
	}