Skip to content

Instantly share code, notes, and snippets.

@ChenyangGao
Created October 14, 2023 06:22
Show Gist options
  • Save ChenyangGao/15452e9b93a9aea6c5951e537558d5a4 to your computer and use it in GitHub Desktop.
Save ChenyangGao/15452e9b93a9aea6c5951e537558d5a4 to your computer and use it in GitHub Desktop.
Unihan (UNICODE HAN DATABASE) Character Information Query Tool
#!/usr/bin/env python3
# coding: utf-8
"""Unihan (UNICODE HAN DATABASE) Character Information Query Tool
Latest version:
- https://www.unicode.org/reports/tr38/
Unihan source:
- https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
- https://www.unicode.org/Public/UCD/latest/ucd/
Other recommended projects:
- https://pypi.org/project/cihai/
- https://libunihan.sourceforge.net
"""
__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
__version__ = (0, 0, 1)
__all__ = ["UNIHANDB", "info", "updatedb"]
from pathlib import Path as _Path
from tempfile import TemporaryDirectory as _TemporaryDirectory
from urllib.request import urlopen as _urlopen, urlretrieve as _urlretrieve
from zipfile import ZipFile as _ZipFile
# Location of the pickled database: stored next to this module when the
# module has a file path, otherwise (``__file__`` is undefined, e.g. when the
# code is pasted into an interactive session) in the current directory.
try:
    UNIHANDB_FILE = str(_Path(__file__).parent / "unihan.pkl")
except NameError:
    UNIHANDB_FILE = "unihan.pkl"
def _get_last_modified_time():
    """Return the "last modified" text shown for Unihan.zip on unicode.org.

    The UCD directory listing page is downloaded and the 16 ASCII characters
    that follow the ``Unihan.zip`` link cell are returned as a string.
    NOTE(review): presumably formatted like ``YYYY-MM-DD HH:MM`` (``updatedb``
    compares these strings lexicographically) -- confirm against the live page.

    :return: The 16-character timestamp text from the directory listing.
    :raises ValueError: If the ``Unihan.zip`` entry cannot be located in the page.
    """
    url = "https://www.unicode.org/Public/UCD/latest/ucd/"
    # Close the HTTP response deterministically instead of leaking the socket.
    with _urlopen(url) as response:
        content = response.read()
    find_text = b'>Unihan.zip</a></td><td align="right">'
    start = content.find(find_text)
    if start < 0:
        # Same exception type as ``bytes.index`` would raise, but with a
        # message that points at the actual cause.
        raise ValueError(f"cannot locate the Unihan.zip entry in the listing at {url}")
    idx = start + len(find_text)
    return content[idx:idx+16].decode("ascii")
def _reporthook(blocks, block_size, total_size):
downloaded_size = blocks * block_size
if downloaded_size >= total_size:
print("\r\x1b[K", end="")
else:
pct = downloaded_size * 100 / total_size
print(f'\rdownloading | {downloaded_size} of {total_size} | {pct:.6f}%', end="")
def updatedb():
    """Update the local Unihan database.

    Data Source: https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

    The timestamp published for ``Unihan.zip`` is compared with the one stored
    under the ``""`` key of the loaded database; when the loaded database is
    at least as new, nothing is downloaded. Otherwise the archive is fetched
    into a temporary directory, every data line of every member file is
    parsed, the module-level ``UNIHANDB`` is replaced and the result is
    pickled to ``UNIHANDB_FILE``.

    Each record is stored under its integer code point and contains the keys
    ``"char"``, ``"codepoint"`` and ``"ucn"`` plus one key per Unihan field.
    """
    import pickle

    global UNIHANDB
    last_modified_time = _get_last_modified_time()
    try:
        # NameError: no database has been loaded yet (first run).
        # KeyError: a loaded database without a version stamp; treat as stale.
        current_version = UNIHANDB[""]
    except (NameError, KeyError):
        current_version = None
    if current_version is not None and current_version >= last_modified_time:
        print("^_^ already the latest version")
        return
    url = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip"
    # The "" key (never a valid code point) carries the version timestamp.
    unihandb = {"": last_modified_time}
    with _TemporaryDirectory() as tmpdir:
        path = _Path(tmpdir) / "Unihan.zip"
        _urlretrieve(url, path, reporthook=_reporthook)
        with _ZipFile(path) as zf:
            for filename in zf.namelist():
                with zf.open(filename) as f:
                    for raw_line in f:
                        # Data lines start with "U+XXXX"; everything else
                        # (comments, blank lines) is skipped.
                        if not raw_line.startswith(b"U+"):
                            continue
                        # Strip only the line terminator: works for LF, CRLF
                        # and for a final line that has no terminator at all.
                        line = raw_line.decode("utf-8").rstrip("\r\n")
                        # The value itself may contain spaces, so split off
                        # just the first two columns.
                        ucn, field, value = line.split(maxsplit=2)
                        codepoint = int(ucn[2:], 16)
                        record = unihandb.get(codepoint)
                        if record is None:
                            record = unihandb[codepoint] = {
                                "char": chr(codepoint),
                                "codepoint": codepoint,
                                "ucn": ucn,
                            }
                        record[field] = value
    UNIHANDB = unihandb
    # Write through a context manager so the file is flushed and closed even
    # if pickling fails part-way.
    with open(UNIHANDB_FILE, "wb") as db_file:
        pickle.dump(unihandb, db_file)
def info(char: int | str) -> dict:
"""Query the information of a Unihan (UNICODE HAN DATABASE) character.
:Reference:
- https://www.unicode.org/reports/tr38/
- https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
:param char: A unicode code point, URN (Uniform Resource Name) or single character of Unihan.
:return: Information related to the Unihan character.
"""
if isinstance(char, str):
if char.startswith("U+"):
codepoint = int(char[2:], 16)
else:
codepoint = ord(char)
else:
codepoint = char
try:
return dict(UNIHANDB[codepoint])
except KeyError as e:
raise ValueError(f"not a unihan: {char!r}") from e
try:
    # The local Unihan database, loaded once at import time.
    # NOTE(security): unpickling can execute arbitrary code. This is only
    # acceptable because the file is produced locally by updatedb(); never
    # point UNIHANDB_FILE at data from an untrusted source.
    with open(UNIHANDB_FILE, "rb") as _db_file:
        UNIHANDB = __import__("pickle").load(_db_file)
except FileNotFoundError:
    # No local copy yet: download and build the database.
    updatedb()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment