Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save GlaireDaggers/0d37e3b624456a7ed2a958478801d37c to your computer and use it in GitHub Desktop.
Save GlaireDaggers/0d37e3b624456a7ed2a958478801d37c to your computer and use it in GitHub Desktop.
C# algorithm for encoding a UTF-32 code point as UTF-8 bytes
// Constants used by EncodeUTF8 to validate a code point and build its UTF-8 byte sequence.

// Low 11 bits. OR-ing a code point with this mask and comparing the result to MAX_SURROGATE
// is true exactly for values in the range 0xD800-0xDFFF.
private const int SURROGATE_LOW_BITS = 0x7FF;
// Upper bound of the 0xD800-0xDFFF range that EncodeUTF8 rejects.
private const int MAX_SURROGATE = 0xDFFF;
// Largest code point EncodeUTF8 accepts; anything above it is rejected.
private const int MAX_FOUR_BYTE = 0x10FFFF;
// A code point with no bits set above its low 7 bits is written as a single byte.
private const int ONE_BYTE_BITS = 7;
// A code point with no bits set above its low 11 bits is written as two bytes.
private const int TWO_BYTE_BITS = 11;
// Marker bits (110xxxxx) OR-ed into the lead byte of a two-byte sequence.
private const int TWO_BYTE_PREFIX = 0xC0;
// A code point with no bits set above its low 16 bits is written as three bytes.
private const int THREE_BYTE_BITS = 16;
// Marker bits (1110xxxx) OR-ed into the lead byte of a three-byte sequence.
private const int THREE_BYTE_PREFIX = 0xE0;
// Marker bits (11110xxx) OR-ed into the lead byte of a four-byte sequence.
private const int FOUR_BYTE_PREFIX = 0xF0;
// Marker bits (10xxxxxx) OR-ed into every byte that follows the lead byte.
private const int CONTINUATION_BYTE = 0x80;
// Selects the low 6 bits of a shifted code point, i.e. the payload of one continuation byte.
private const int CONTINUATION_MASK = 0x3F;
/// <summary>
/// Encodes one code point as UTF-8 and appends the resulting one to four bytes to
/// <paramref name="buffer"/>, lead byte first.
/// </summary>
/// <param name="codepoint">The code point to encode.</param>
/// <param name="buffer">Destination that receives the encoded bytes via <c>Append</c>.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="codepoint"/> lies in 0xD800-0xDFFF or is greater than 0x10FFFF.
/// </exception>
private void EncodeUTF8(uint codepoint, AppendBuffer<byte> buffer)
{
    // Setting the low 11 bits maps every value in 0xD800-0xDFFF (and nothing else) onto 0xDFFF.
    bool inSurrogateRange = (codepoint | SURROGATE_LOW_BITS) == MAX_SURROGATE;
    bool beyondLastCodePoint = codepoint > MAX_FOUR_BYTE;
    if (inSurrogateRange || beyondLastCodePoint)
    {
        throw new ArgumentOutOfRangeException(nameof(codepoint));
    }

    // Values that fit in 7 bits are emitted unchanged as a single byte.
    if ((codepoint >> ONE_BYTE_BITS) == 0)
    {
        buffer.Append((byte)codepoint);
        return;
    }

    // Choose the lead byte and how many continuation bytes follow it.
    uint leadByte;
    int continuationCount;
    if ((codepoint >> TWO_BYTE_BITS) == 0)
    {
        leadByte = TWO_BYTE_PREFIX | (codepoint >> 6);
        continuationCount = 1;
    }
    else if ((codepoint >> THREE_BYTE_BITS) == 0)
    {
        leadByte = THREE_BYTE_PREFIX | (codepoint >> 12);
        continuationCount = 2;
    }
    else
    {
        leadByte = FOUR_BYTE_PREFIX | (codepoint >> 18);
        continuationCount = 3;
    }

    buffer.Append((byte)leadByte);

    // Emit the remaining payload six bits at a time, most significant group first.
    for (int shift = 6 * (continuationCount - 1); shift >= 0; shift -= 6)
    {
        uint payload = (codepoint >> shift) & CONTINUATION_MASK;
        buffer.Append((byte)(CONTINUATION_BYTE | payload));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment