Skip to content

Instantly share code, notes, and snippets.

@tslater2006
Created March 31, 2023 14:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tslater2006/1e8a304976ae3267f7a72ebd221ff43a to your computer and use it in GitHub Desktop.
Save tslater2006/1e8a304976ae3267f7a72ebd221ff43a to your computer and use it in GitHub Desktop.
Converts a Unicode code point to UTF-8 and vice versa. Courtesy of GPT-4
using System.Text;
// Encodes a single Unicode scalar value as its UTF-8 byte sequence.
//
// codepoint: the value to encode. It must lie in 0..0x10FFFF and must not be
//            a surrogate (0xD800..0xDFFF).
// Returns:   a newly allocated array of 1 to 4 bytes holding the encoding.
// Throws:    ArgumentOutOfRangeException when codepoint is negative, above
//            0x10FFFF, or a surrogate.
static byte[] UnicodeCodepointToUtf8(int codepoint)
{
    // Surrogates are UTF-16 code units, not scalar values; encoding them
    // would produce ill-formed UTF-8 that conforming decoders reject.
    bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    if (codepoint < 0 || codepoint > 0x10FFFF || isSurrogate)
    {
        // Use the (paramName, actualValue, message) overload: the single-string
        // overload would treat the message text as the parameter name.
        throw new ArgumentOutOfRangeException(nameof(codepoint), codepoint, "Invalid Unicode codepoint");
    }

    if (codepoint <= 0x7F)
    {
        // 0xxxxxxx
        return new byte[] { (byte)codepoint };
    }

    if (codepoint <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        return new byte[]
        {
            (byte)(0xC0 | (codepoint >> 6)),
            (byte)(0x80 | (codepoint & 0x3F)),
        };
    }

    if (codepoint <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return new byte[]
        {
            (byte)(0xE0 | (codepoint >> 12)),
            (byte)(0x80 | ((codepoint >> 6) & 0x3F)),
            (byte)(0x80 | (codepoint & 0x3F)),
        };
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return new byte[]
    {
        (byte)(0xF0 | (codepoint >> 18)),
        (byte)(0x80 | ((codepoint >> 12) & 0x3F)),
        (byte)(0x80 | ((codepoint >> 6) & 0x3F)),
        (byte)(0x80 | (codepoint & 0x3F)),
    };
}
// Decodes one UTF-8 encoded code point starting at the stream's current
// position and advances the stream past the bytes it read.
//
// ms:      the stream to read from.
// Returns: the decoded code point, or 0 when the stream is already at its end.
//          NOTE: 0 is also the result of decoding a real U+0000 byte, so callers
//          that must tell the two apart should compare Position with Length first.
// Throws:  ArgumentNullException when ms is null; ArgumentException when the
//          bytes are not well-formed UTF-8 (bad lead byte, missing or invalid
//          continuation byte, overlong form, encoded surrogate, or a value above
//          0x10FFFF). Bytes read before the failure stay consumed.
static uint Utf8BytesToCodepoint(MemoryStream ms)
{
    if (ms is null)
    {
        throw new ArgumentNullException(nameof(ms));
    }

    int firstByte = ms.ReadByte();
    if (firstByte < 0)
    {
        return 0; // End of stream
    }

    if (firstByte < 0x80)
    {
        // Single-byte (ASCII) form needs no further validation.
        return (uint)firstByte;
    }

    uint codepoint;
    int additionalBytes;
    uint smallestForLength; // lowest value that genuinely requires this many bytes

    if (firstByte >= 0xC0 && firstByte < 0xE0)
    {
        codepoint = (uint)(firstByte & 0x1F);
        additionalBytes = 1;
        smallestForLength = 0x80;
    }
    else if (firstByte >= 0xE0 && firstByte < 0xF0)
    {
        codepoint = (uint)(firstByte & 0x0F);
        additionalBytes = 2;
        smallestForLength = 0x800;
    }
    else if (firstByte >= 0xF0 && firstByte < 0xF8)
    {
        codepoint = (uint)(firstByte & 0x07);
        additionalBytes = 3;
        smallestForLength = 0x10000;
    }
    else
    {
        // A stray continuation byte (0x80..0xBF) or a lead byte >= 0xF8.
        throw new ArgumentException("Invalid UTF-8 byte sequence.");
    }

    for (int i = 0; i < additionalBytes; i++)
    {
        int nextByte = ms.ReadByte();

        // ReadByte returns -1 at end of stream, which also fails this range
        // check, so a truncated sequence is rejected here as well.
        if (nextByte < 0x80 || nextByte >= 0xC0)
        {
            throw new ArgumentException("Invalid UTF-8 byte sequence.");
        }

        codepoint = (codepoint << 6) | (uint)(nextByte & 0x3F);
    }

    // Reject overlong encodings, UTF-16 surrogates and values beyond the
    // Unicode range; none of these are well-formed UTF-8.
    bool isOverlong = codepoint < smallestForLength;
    bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    if (isOverlong || isSurrogate || codepoint > 0x10FFFF)
    {
        throw new ArgumentException("Invalid UTF-8 byte sequence.");
    }

    return codepoint;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment