Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
集保罕字 Big5 轉 Unicode
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Globalization;
namespace TdccBig5
{
/// <summary>
/// Replaces TDCC rare characters (Big5 user-defined code points) in a string
/// with the standard Unicode characters listed in the TDCC mapping table.
/// </summary>
public static class TdccBig5ToUnicode
{
    // Big5 user-defined (custom font) blocks: FA40-FEFE + 8E40-A0FE + 8140-8DFE.
    private const int Range1Begin = 0xFA40;
    private const int Range1End = 0xFEFE;
    private const int Range2Begin = 0x8E40;
    private const int Range2End = 0xA0FE;
    private const int Range3Begin = 0x8140;
    private const int Range3End = 0x8DFE;

    // https://www.tdcc.com.tw/portal/zh/download/downloadArea?offset=&title=&subTitle=&parent=402897956c6bbcd6016c6bbd6bee0261&subCategory=0081f82f6c4b1d8f016c4b221a3a0002&keywords=
    // Rare-character font and mapping file.
    // Key: Big5 code as upper-case hex; value: Unicode code point as hex.
    private static readonly Dictionary<string, string> _mapping = new Dictionary<string, string>()
    {
        // big5 code, unicode codepoint, string, pinyin
        // ...
        { "994B", "05553" }, // 啓 (Zhuyin: ㄑㄧˇ)
        // ...
    };

    /// <summary>
    /// Converts every character of <paramref name="tdccString"/> that encodes to a
    /// Big5 user-defined code present in the mapping table into the mapped Unicode
    /// character. All other characters are copied through unchanged.
    /// </summary>
    /// <param name="tdccString">Text that may contain TDCC rare characters.</param>
    /// <returns>
    /// The converted string. <c>null</c> or empty input is returned as-is, and the
    /// original input is returned if the conversion fails unexpectedly.
    /// </returns>
    public static string Transform(string tdccString)
    {
        if (string.IsNullOrEmpty(tdccString))
        {
            return tdccString;
        }

        try
        {
            // Registering the same provider repeatedly is harmless; it is required
            // before "BIG5" can be resolved on .NET Core / .NET 5+.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            Encoding big5 = Encoding.GetEncoding("BIG5");

            // Reused scratch buffer sized for the worst-case encoding of one char.
            byte[] buffer = new byte[big5.GetMaxByteCount(1)];
            StringBuilder result = new StringBuilder(tdccString.Length);

            for (int i = 0; i < tdccString.Length; i++)
            {
                int byteCount = big5.GetBytes(tdccString, i, 1, buffer, 0);

                // Only double-byte sequences can fall inside the user-defined blocks.
                // Single-byte results (ASCII, '?' replacements) are copied verbatim
                // instead of aborting the whole conversion.
                if (byteCount == 2)
                {
                    // Big5 is lead byte first, so combine as big-endian explicitly.
                    int value = (buffer[0] << 8) | buffer[1];

                    if (IsUserDefinedCode(value)
                        && _mapping.TryGetValue(value.ToString("X", CultureInfo.InvariantCulture), out string hexCodepoint))
                    {
                        int codepoint = int.Parse(hexCodepoint, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                        result.Append(char.ConvertFromUtf32(codepoint));
                        continue;
                    }
                }

                result.Append(tdccString[i]);
            }

            return result.ToString();
        }
        catch (Exception)
        {
            // Deliberate best-effort fallback: hand back the input untouched rather
            // than fail the caller (e.g. encoding unavailable, malformed table entry).
            return tdccString;
        }
    }

    // True when the 16-bit Big5 code lies in one of the user-defined blocks.
    private static bool IsUserDefinedCode(int value)
    {
        return (value >= Range1Begin && value <= Range1End)
            || (value >= Range2Begin && value <= Range2End)
            || (value >= Range3Begin && value <= Range3End);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment