Created
January 30, 2020 06:08
-
-
Save ufcpp/19c658b65439924e287f0d7f1eb6227b to your computer and use it in GitHub Desktop.
ill-formed な文字
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Runtime.InteropServices; | |
using System.Text; | |
/// <summary>
/// Demo that prints, for a single emoji, its code point, its UTF-16 surrogate pair,
/// its UTF-8 bytes, and the bytes obtained by encoding each surrogate on its own
/// (WTF-8 style) with <see cref="ToIllFormedUtf8"/>.
/// </summary>
class Program
{
    static void Main()
    {
        var s = "😀";

        // Expected output:
        //   codepoint: 1F600
        //   UTF-16 high: D83D, low: DE00
        //   UTF-8: { F0 9F 98 80 }
        //   UTF-16 → WTF-8 hight: { ED A0 BD }, low: { ED B8 80 }

        // Combine the surrogate pair directly; this does not depend on host byte order
        // and avoids allocating a temporary UTF-32 byte array.
        var cp = char.ConvertToUtf32(s, 0);
        Console.WriteLine($"codepoint: {cp:X}");

        var hs = (int)s[0];
        var ls = (int)s[1];
        Console.WriteLine($"UTF-16 high: {hs:X}, low: {ls:X}");

        // One scratch buffer is reused for every encoding below, so each result
        // must be printed before the next call overwrites it.
        Span<byte> buffer = stackalloc byte[4];

        var utf8 = ToIllFormedUtf8(cp, buffer);
        Console.Write("UTF-8: {");
        foreach (var b in utf8) Console.Write($" {b:X2}");
        Console.WriteLine(" }");

        var wtf8 = ToIllFormedUtf8(hs, buffer);
        Console.Write("UTF-16 → WTF-8 hight: {");
        foreach (var b in wtf8) Console.Write($" {b:X2}");

        wtf8 = ToIllFormedUtf8(ls, buffer);
        Console.Write(" }, low: {");
        foreach (var b in wtf8) Console.Write($" {b:X2}");
        Console.WriteLine(" }");
    }

    /// <summary>
    /// Encodes a single value with the UTF-8 bit layout without any validation of
    /// whether it is a legal Unicode scalar value.
    /// The current Encoding.UTF8.GetBytes replaces ill-formed code points (such as lone
    /// surrogates) with U+FFFD; this method does no such replacement and encodes naively.
    /// </summary>
    /// <param name="c">Value to encode, in the range 0 to 0x1FFFFF (21 bits).</param>
    /// <param name="buffer">
    /// Destination for the encoded bytes. Between 1 and 4 bytes are written starting at
    /// index 0, depending on the magnitude of <paramref name="c"/>.
    /// </param>
    /// <returns>The leading slice of <paramref name="buffer"/> holding the encoded bytes.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="c"/> is negative or does not fit in the 21 bits of the 4-byte form.
    /// </exception>
    internal static Span<byte> ToIllFormedUtf8(int c, Span<byte> buffer)
    {
        // Lead-byte markers for 2-, 3- and 4-byte sequences, and the continuation-byte marker.
        const int Two = 0b1100_0000;
        const int Three = 0b1110_0000;
        const int Four = 0b1111_0000;
        const int Tailing = 0b1000_0000;

        // Payload masks: 6 bits per continuation byte, 5/4/3 bits in the lead byte.
        const int Bits6 = 0b11_1111;
        const int Bits5 = 0b1_1111;
        const int Bits4 = 0b1111;
        const int Bits3 = 0b111;

        // The 4-byte form carries 3 + 6 + 6 + 6 = 21 bits; anything else would be truncated.
        if (c < 0 || c > 0x1FFFFF)
        {
            throw new ArgumentOutOfRangeException(nameof(c), c, "Value must be in the range 0 to 0x1FFFFF.");
        }

        if (c < 0x80)
        {
            // 7 bits: single byte, identical to ASCII.
            buffer[0] = (byte)c;
            return buffer.Slice(0, 1);
        }
        else if (c < 0x800)
        {
            // 11 bits: 110xxxxx 10xxxxxx
            buffer[0] = (byte)(((c >> 6) & Bits5) | Two);
            buffer[1] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 2);
        }
        else if (c < 0x10000)
        {
            // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx (surrogates D800-DFFF are encoded as-is).
            buffer[0] = (byte)(((c >> 12) & Bits4) | Three);
            buffer[1] = (byte)(((c >> 6) & Bits6) | Tailing);
            buffer[2] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 3);
        }
        else
        {
            // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buffer[0] = (byte)(((c >> 18) & Bits3) | Four);
            buffer[1] = (byte)(((c >> 12) & Bits6) | Tailing);
            buffer[2] = (byte)(((c >> 6) & Bits6) | Tailing);
            buffer[3] = (byte)((c & Bits6) | Tailing);
            return buffer.Slice(0, 4);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment