Skip to content

Instantly share code, notes, and snippets.

@ufcpp
Created September 6, 2018 02:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ufcpp/1573a1a453bce1827b6b5025f79ed18a to your computer and use it in GitHub Desktop.
Save ufcpp/1573a1a453bce1827b6b5025f79ed18a to your computer and use it in GitHub Desktop.
Latin-1 文字の Unicode Category
// https://github.com/dotnet/coreclr/issues/19706 これを見て試しただけ。
// ほんとに Latin-1 文字だけだった。
//
// (§, A7, OtherSymbol, OtherPunctuation)
// (a, AA, LowercaseLetter, OtherLetter)
// (-, AD, DashPunctuation, Format)
// (¶, B6, OtherSymbol, OtherPunctuation)
// (o, BA, LowercaseLetter, OtherLetter)
//
// char の方は Unicode 4.0、CharUnicodeInfo の方は Unicode 5.0 の定義に沿ってるらしい?
//
// 日本語の ・(U+30FB, KATAKANA MIDDLE DOT)も同時期(Unicode 5.0 のとき)に変更されてるっぽいけど、
// ・は普通に char と CharUnicodeInfo で結果一致するのに…
using System.Globalization;
using static System.Console;
/// <summary>
/// Console program that prints every Unicode code point for which
/// <c>char.GetUnicodeCategory</c> and <c>CharUnicodeInfo.GetUnicodeCategory</c>
/// report different categories.
/// </summary>
static class Program
{
    /// <summary>
    /// Scans U+0000..U+10FFFF and writes one tuple per mismatch:
    /// (character, code point in hex, category from <c>char</c>, category from <c>CharUnicodeInfo</c>).
    /// </summary>
    static void Main()
    {
        // Basic Multilingual Plane: each code point is a single UTF-16 code unit.
        // The bound is inclusive so that U+FFFF itself is checked as well.
        // Lone surrogates (U+D800..U+DFFF) are passed as plain chars here, which both APIs accept.
        for (int i = 0; i <= 0xffff; i++)
        {
            var c = (char)i;
            var fromChar = char.GetUnicodeCategory(c);
            var fromInfo = CharUnicodeInfo.GetUnicodeCategory(c);
            if (fromChar != fromInfo)
            {
                WriteLine((c, i.ToString("X"), fromChar, fromInfo));
            }
        }

        // Supplementary planes: each code point needs a surrogate pair, so it is
        // converted to a string and the (string, index) overloads are used.
        // The bound is inclusive so that the last code point, U+10FFFF, is checked as well.
        for (int i = 0x10000; i <= 0x10ffff; i++)
        {
            var s = char.ConvertFromUtf32(i);
            var fromChar = char.GetUnicodeCategory(s, 0);
            var fromInfo = CharUnicodeInfo.GetUnicodeCategory(s, 0);
            if (fromChar != fromInfo)
            {
                WriteLine((s, i.ToString("X"), fromChar, fromInfo));
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment