Created
January 20, 2018 14:21
-
-
Save ufcpp/0b1d03675739d624c6d53d8db316d0ea to your computer and use it in GitHub Desktop.
UTF-8 エンコードされたバイト列から文字を読みだす
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using static System.Console; | |
using static System.Globalization.CharUnicodeInfo; | |
/// <summary>
/// Demo program: decodes one code point from a UTF-8 byte sequence and
/// prints the code point together with its Unicode general category.
/// </summary>
class Program
{
    /// <summary>
    /// Decodes the UTF-8 bytes of a single supplementary-plane character and
    /// writes its code point (hexadecimal) and Unicode category to the console.
    /// </summary>
    static void Main()
    {
        // UTF-8 bytes of the character U+29E3D.
        // Files and network payloads are normally stored/sent as UTF-8 these days.
        var utf8 = new byte[] { 240, 169, 184, 189 };

        // Decode the UTF-8 sequence; the result should be the code point U+29E3D.
        var c = Decode(utf8, 0);
        WriteLine(c.ToString("X")); // 29E3D

        // Passing the int code point straight to GetUnicodeCategory was a compile
        // error when this sample was written (no overload accepted an int).
        // Converting to a UTF-16 string first and using the (string, index)
        // overload avoids depending on such an overload being available.
        var category = GetUnicodeCategory(char.ConvertFromUtf32(c), 0);
        WriteLine(category);
    }

    /// <summary>
    /// Decodes a single Unicode code point from a UTF-8 encoded byte array.
    /// </summary>
    /// <param name="buffer">Array holding the UTF-8 encoded bytes.</param>
    /// <param name="i">
    /// Index of the first byte of the sequence to decode. It is passed by value,
    /// so the caller's index is not advanced.
    /// </param>
    /// <returns>
    /// The decoded code point, or U+FFFD (the replacement character) when the
    /// bytes starting at <paramref name="i"/> are not a well-formed UTF-8
    /// sequence: an invalid lead byte, a missing or malformed continuation byte,
    /// an overlong encoding, a surrogate, or a value above U+10FFFF.
    /// </returns>
    /// <remarks>
    /// The first byte is read without extra argument checks, so a null
    /// <paramref name="buffer"/> or an out-of-range <paramref name="i"/> fails
    /// with the runtime's own exception, exactly as a plain array access would.
    /// </remarks>
    public static int Decode(byte[] buffer, int i)
    {
        const int ReplacementChar = 0xFFFD;

        int lead = buffer[i++];

        // 0xxxxxxx: single-byte (ASCII) code point.
        if (lead < 0b1000_0000)
        {
            return lead;
        }

        int trailing; // number of continuation bytes that must follow the lead byte
        int minimum;  // smallest code point that genuinely needs this sequence length
        int code;     // accumulator, seeded with the payload bits of the lead byte

        if (lead < 0b1100_0010)
        {
            // 0x80-0xBF is a stray continuation byte; 0xC0 and 0xC1 could only
            // start an overlong two-byte form of an ASCII character.
            return ReplacementChar;
        }
        else if (lead < 0b1110_0000)
        {
            trailing = 1;
            minimum = 0x80;
            code = lead & 0b1_1111;
        }
        else if (lead < 0b1111_0000)
        {
            trailing = 2;
            minimum = 0x800;
            code = lead & 0b1111;
        }
        else if (lead < 0b1111_0101)
        {
            trailing = 3;
            minimum = 0x10000;
            code = lead & 0b0111;
        }
        else
        {
            // 0xF5-0xFF can never appear in well-formed UTF-8.
            return ReplacementChar;
        }

        // Truncated input: report it as malformed instead of reading past the end.
        if (buffer.Length - i < trailing)
        {
            return ReplacementChar;
        }

        for (; trailing > 0; trailing--)
        {
            int next = buffer[i++];

            // Every continuation byte must have the form 10xxxxxx.
            if ((next & 0b1100_0000) != 0b1000_0000)
            {
                return ReplacementChar;
            }

            code = (code << 6) | (next & 0b0011_1111);
        }

        // Reject overlong encodings, UTF-16 surrogates and out-of-range values.
        bool isOverlong = code < minimum;
        bool isSurrogate = code >= 0xD800 && code <= 0xDFFF;
        if (isOverlong || isSurrogate || code > 0x10FFFF)
        {
            return ReplacementChar;
        }

        return code;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment