Skip to content

Instantly share code, notes, and snippets.

@ufcpp
Created January 30, 2020 06:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ufcpp/19c658b65439924e287f0d7f1eb6227b to your computer and use it in GitHub Desktop.
Save ufcpp/19c658b65439924e287f0d7f1eb6227b to your computer and use it in GitHub Desktop.
ill-formed な文字
using System;
using System.Runtime.InteropServices;
using System.Text;
class Program
{
    /// <summary>
    /// Demo entry point: prints the code point of U+1F600, its UTF-16 surrogate pair,
    /// its UTF-8 bytes, and the bytes obtained by encoding each surrogate on its own.
    /// </summary>
    static void Main()
    {
        var s = "😀";

        // Expected output:
        //   codepoint: 1F600
        //   UTF-16 high: D83D, low: DE00
        //   UTF-8: { F0 9F 98 80 }
        //   UTF-16 → WTF-8 hight: { ED A0 BD }, low: { ED B8 80 }

        // Combine the surrogate pair directly instead of reinterpreting UTF-32 bytes,
        // so the result does not depend on the byte order of the host.
        var cp = char.ConvertToUtf32(s, 0);
        Console.WriteLine($"codepoint: {cp:X}");

        var hs = (int)s[0];
        var ls = (int)s[1];
        Console.WriteLine($"UTF-16 high: {hs:X}, low: {ls:X}");

        // One scratch buffer is reused for every encoding below, so each result
        // is printed before the next call overwrites it.
        Span<byte> buffer = stackalloc byte[4];

        Console.Write("UTF-8: {");
        WriteHex(ToIllFormedUtf8(cp, buffer));
        Console.WriteLine(" }");

        Console.Write("UTF-16 → WTF-8 hight: {");
        WriteHex(ToIllFormedUtf8(hs, buffer));
        Console.Write(" }, low: {");
        WriteHex(ToIllFormedUtf8(ls, buffer));
        Console.WriteLine(" }");
    }

    /// <summary>
    /// Writes each byte to the console as a space-prefixed two-digit uppercase hex value.
    /// </summary>
    /// <param name="bytes">The bytes to print.</param>
    private static void WriteHex(ReadOnlySpan<byte> bytes)
    {
        foreach (var b in bytes)
        {
            Console.Write($" {b:X2}");
        }
    }

    /// <summary>
    /// Encodes a single value with the UTF-8 bit layout without any validation of
    /// surrogates. Encoding.UTF8.GetBytes replaces invalid code points with U+FFFD;
    /// this method deliberately does not, and encodes the value naively instead
    /// (so a lone surrogate yields a three-byte, ill-formed sequence).
    /// </summary>
    /// <param name="c">The value to encode; must be in the range 0..0x1FFFFF.</param>
    /// <param name="buffer">
    /// Destination buffer. It must be large enough for the encoded length (up to 4 bytes);
    /// a shorter buffer makes the span indexer throw.
    /// </param>
    /// <returns>The leading slice of <paramref name="buffer"/> holding the encoded bytes.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="c"/> is negative or does not fit in a four-byte sequence.
    /// </exception>
    internal static Span<byte> ToIllFormedUtf8(int c, Span<byte> buffer)
    {
        // Lead-byte markers for 2-, 3- and 4-byte sequences, and the continuation-byte marker.
        const int Two = 0b1100_0000;
        const int Three = 0b1110_0000;
        const int Four = 0b1111_0000;
        const int Tailing = 0b1000_0000;

        // Payload masks: 6 bits per continuation byte, 5/4/3 bits in the lead byte.
        const int Bits6 = 0b11_1111;
        const int Bits5 = 0b1_1111;
        const int Bits4 = 0b1111;
        const int Bits3 = 0b111;

        // A four-byte sequence carries 3 + 6 + 6 + 6 = 21 payload bits; anything
        // outside that range would be silently truncated by the masks below.
        if (c < 0 || c > 0x1FFFFF)
        {
            throw new ArgumentOutOfRangeException(nameof(c), c, "Value must be in the range 0..0x1FFFFF.");
        }

        if (c < 0x80)
        {
            // 7 bits: single byte, stored as-is.
            buffer[0] = (byte)c;
            return buffer.Slice(0, 1);
        }
        else if (c < 0x800)
        {
            // 11 bits: 5 in the lead byte, 6 in the continuation byte.
            buffer[0] = (byte)(((c >> 6) & Bits5) | Two);
            buffer[1] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 2);
        }
        else if (c < 0x10000)
        {
            // 16 bits: 4 + 6 + 6. Surrogates (0xD800..0xDFFF) land here unchecked.
            buffer[0] = (byte)(((c >> 12) & Bits4) | Three);
            buffer[1] = (byte)(((c >> 6) & Bits6) | Tailing);
            buffer[2] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 3);
        }
        else
        {
            // 21 bits: 3 + 6 + 6 + 6.
            buffer[0] = (byte)(((c >> 18) & Bits3) | Four);
            buffer[1] = (byte)(((c >> 12) & Bits6) | Tailing);
            buffer[2] = (byte)(((c >> 6) & Bits6) | Tailing);
            buffer[3] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 4);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment