Skip to content

Instantly share code, notes, and snippets.

@cpurdy
Created April 11, 2023 02:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cpurdy/ee092bb7ed229470f5e91502af1a019e to your computer and use it in GitHub Desktop.
Save cpurdy/ee092bb7ed229470f5e91502af1a019e to your computer and use it in GitHub Desktop.
/**
 * Decode one UTF-8 encoded Unicode character from the passed stream, consuming the leading byte
 * and however many continuation bytes the leading byte calls for. When the decoded character
 * reports that it requires a trailing surrogate, the next character is decoded from the same
 * stream and joined to it.
 *
 * @param in  the BinaryInput stream to read the encoded bytes from
 *
 * @return the character decoded from the stream
 *
 * @throws IllegalUTF  if the leading byte does not match any of the supported bit patterns, or
 *                     if a continuation byte does not have the form 10xxxxxx
 */
static Char readUTF8Char(BinaryInput in)
{
    /**
     * Read one continuation byte (10xxxxxx) and return its six payload bits.
     */
    private UInt32 trailing(BinaryInput in)
    {
        Byte next = in.readByte();
        if ((next & 0b11000000) != 0b10000000)
        {
            throw new IllegalUTF("trailing unicode byte does not match 10xxxxxx");
        }
        return (next & 0b00111111).toUInt32();
    }

    // the format is selected by the number of high-order 1-bits in the first byte:
    //
    //   #1s  first byte  trailing  # trailing  bits  code-points
    //   ---  ----------  --------  ----------  ----  -----------------------
    //    0   0xxxxxxx    n/a           0         7   U+0000    - U+007F (ASCII)
    //    2   110xxxxx    10xxxxxx      1        11   U+0080    - U+07FF
    //    3   1110xxxx    10xxxxxx      2        16   U+0800    - U+FFFF
    //    4   11110xxx    10xxxxxx      3        21   U+10000   - U+1FFFFF
    //    5   111110xx    10xxxxxx      4        26   U+200000  - U+3FFFFFF
    //    6   1111110x    10xxxxxx      5        31   U+4000000 - U+7FFFFFFF
    Byte   b         = in.readByte();
    UInt32 codepoint = b.toUInt32();
    Int    extra;

    // the highest 0-bit of the first byte (i.e. the highest 1-bit of its complement) marks the
    // end of the length prefix
    switch ((~b).leftmostBit)
    {
    case 0b10000000:
        // single byte (ASCII)
        return codepoint.toChar();

    case 0b00100000:
        // two bytes; the 11-bit result is returned without the surrogate check below
        return (((codepoint & 0b00011111) << 6) | trailing(in)).toChar();

    case 0b00010000:
        codepoint = codepoint & 0b00001111;
        extra     = 2;
        break;

    case 0b00001000:
        codepoint = codepoint & 0b00000111;
        extra     = 3;
        break;

    case 0b00000100:
        codepoint = codepoint & 0b00000011;
        extra     = 4;
        break;

    case 0b00000010:
        codepoint = codepoint & 0b00000001;
        extra     = 5;
        break;

    default:
        throw new IllegalUTF($"initial byte: {b}");
    }

    // append six payload bits for each continuation byte, in stream order
    for (Int i = 0; i < extra; ++i)
    {
        codepoint = (codepoint << 6) | trailing(in);
    }

    Char ch = codepoint.toChar();
    if (ch.requiresTrailingSurrogate())
    {
        return ch.addTrailingSurrogate(readUTF8Char(in));
    }
    return ch;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment