Created
June 14, 2020 07:02
-
-
Save thstarshine/9768c10a6e9bd172fd9c198ee2768db3 to your computer and use it in GitHub Desktop.
集保罕字 Big5 轉 Unicode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using System.Text; | |
using System.Collections.Generic; | |
using System.Text.RegularExpressions; | |
using System.Globalization; | |
namespace TdccBig5
{
    /// <summary>
    /// Converts text that uses TDCC (Taiwan Depository &amp; Clearing Corporation)
    /// rare-character Big5 codes into standard Unicode characters by means of a
    /// lookup table.
    /// </summary>
    public static class TdccBig5ToUnicode
    {
        // Table source ("rare-character font and mapping file" download):
        // https://www.tdcc.com.tw/portal/zh/download/downloadArea?offset=&title=&subTitle=&parent=402897956c6bbcd6016c6bbd6bee0261&subCategory=0081f82f6c4b1d8f016c4b221a3a0002&keywords=
        //
        // Key:   Big5 code as 4 upper-case hex digits.
        // Value: Unicode code point as hex digits.
        private static readonly Dictionary<string, string> _mapping = new Dictionary<string, string>()
        {
            // big5 code, unicode codepoint, string, pinyin
            // ...
            { "994B", "05553" }, // 啓 #### ㄑㄧˇ
            // ...
        };

        // Big5 custom-font (user-defined) blocks that are candidates for remapping:
        // FA40-FEFE + 8E40-A0FE + 8140-8DFE
        private const int Range1Begin = 0xFA40;
        private const int Range1End = 0xFEFE;
        private const int Range2Begin = 0x8E40;
        private const int Range2End = 0xA0FE;
        private const int Range3Begin = 0x8140;
        private const int Range3End = 0x8DFE;

        /// <summary>
        /// Replaces every character of <paramref name="tdccString"/> whose Big5
        /// encoding falls in one of the custom-font blocks and has an entry in the
        /// mapping table with the mapped Unicode character. All other characters
        /// are copied through unchanged.
        /// </summary>
        /// <param name="tdccString">Text to convert; may be null or empty.</param>
        /// <returns>
        /// The converted text. Null or empty input is returned as is. If the
        /// conversion fails for any reason, the original input is returned.
        /// </returns>
        public static string Transform(string tdccString)
        {
            if (string.IsNullOrEmpty(tdccString))
            {
                return tdccString;
            }

            try
            {
                // Registered before GetEncoding("BIG5") so the lookup can use the
                // code-pages provider.
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
                Encoding big5 = Encoding.GetEncoding("BIG5");

                StringBuilder unicodeString = new StringBuilder(tdccString.Length);
                char[] single = new char[1];

                foreach (char ch in tdccString)
                {
                    single[0] = ch;
                    byte[] big5Bytes = big5.GetBytes(single);

                    // Only double-byte codes can lie in the custom-font blocks.
                    // Single-byte results (ASCII, or the encoder's replacement
                    // for an unencodable character) are copied through; feeding
                    // them to a 16-bit read would throw and abort the whole
                    // conversion.
                    if (big5Bytes.Length == 2)
                    {
                        // Big5 codes are written lead byte first (big-endian).
                        int value = (big5Bytes[0] << 8) | big5Bytes[1];

                        if (IsCustomFontCode(value))
                        {
                            string key = value.ToString("X4", CultureInfo.InvariantCulture);
                            string hexCodepoint;
                            if (_mapping.TryGetValue(key, out hexCodepoint))
                            {
                                int codepoint = int.Parse(hexCodepoint, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                                unicodeString.Append(char.ConvertFromUtf32(codepoint));
                                continue;
                            }
                        }
                    }

                    unicodeString.Append(ch);
                }

                return unicodeString.ToString();
            }
            catch (Exception)
            {
                // Deliberate best-effort fallback: if the encoding is unavailable
                // or a table entry is malformed, hand back the input untouched.
                return tdccString;
            }
        }

        // True when the 16-bit Big5 code lies in one of the three custom-font blocks.
        private static bool IsCustomFontCode(int value)
        {
            return (value >= Range1Begin && value <= Range1End)
                || (value >= Range2Begin && value <= Range2End)
                || (value >= Range3Begin && value <= Range3End);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment