ChenyangGao/unihan.py

## unihan.py
#!/usr/bin/env python3
# coding: utf-8

"""Unihan (UNICODE HAN DATABASE) Character Information Query Tool

Latest version:
    - https://www.unicode.org/reports/tr38/
Unihan source:
    - https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
    - https://www.unicode.org/Public/UCD/latest/ucd/
Other recommended projects:
    - https://pypi.org/project/cihai/
    - https://libunihan.sourceforge.net
"""

# Module metadata. ``__all__`` restricts what a star-import of this module
# exports to the database object and the two public functions.
__author__  = "ChenyangGao <https://chenyanggao.github.io/>"
__version__ = (0, 0, 1)
__all__ = ["UNIHANDB", "info", "updatedb"]

from pathlib import Path as _Path
from tempfile import TemporaryDirectory as _TemporaryDirectory
from urllib.request import urlopen as _urlopen, urlretrieve as _urlretrieve
from zipfile import ZipFile as _ZipFile


# Store the pickled database next to this module. When ``__file__`` is not
# defined for the running code, use a path relative to the current working
# directory instead.
if "__file__" in globals():
    UNIHANDB_FILE = str(_Path(__file__).with_name("unihan.pkl"))
else:
    UNIHANDB_FILE = "unihan.pkl"


def _get_last_modified_time():
    """Return the timestamp text shown for Unihan.zip in the UCD listing.

    Downloads the directory listing page and returns the 16 ASCII characters
    that follow the table cell linking to ``Unihan.zip``.
    NOTE(review): presumably of the form ``YYYY-MM-DD HH:MM`` so that plain
    string comparison orders timestamps chronologically -- confirm against
    the live listing.

    :return: The 16-character timestamp string taken from the listing.

    :raise ValueError: If the expected markup is not found in the page.
    """
    url = "https://www.unicode.org/Public/UCD/latest/ucd/"
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with _urlopen(url) as response:
        content = response.read()
    find_text = b'>Unihan.zip</a></td><td align="right">'
    start = content.find(find_text)
    if start < 0:
        raise ValueError(
            "cannot locate the Unihan.zip entry in the UCD directory listing"
        )
    idx = start + len(find_text)
    return content[idx:idx+16].decode("ascii")


def _reporthook(blocks, block_size, total_size):
    """Print a one-line download progress indicator.

    Has the three-argument signature of the ``reporthook`` callback accepted
    by ``urllib.request.urlretrieve``.

    :param blocks: Number of blocks transferred so far.
    :param block_size: Size of a block in bytes.
    :param total_size: Total size of the download in bytes.
    """
    downloaded_size = blocks * block_size
    if downloaded_size >= total_size:
        # Done (this branch is also taken when the reported total is not
        # positive): return to column 0 and erase the progress line.
        print("\r\x1b[K", end="", flush=True)
    else:
        pct = downloaded_size * 100 / total_size
        # The text has no trailing newline, so it must be flushed explicitly
        # or a line-buffered terminal would not show it as it happens.
        print(
            f'\rdownloading | {downloaded_size} of {total_size} | {pct:.6f}%',
            end="",
            flush=True,
        )


def updatedb():
    """Update the local Unihan database.

    Data Source: https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

    The remote timestamp of Unihan.zip is compared with the one stored under
    the ``""`` key of ``UNIHANDB``. If the loaded database is at least as
    recent, a message is printed and nothing else happens. Otherwise the
    archive is downloaded to a temporary directory, every data line of every
    member is parsed into a per-code-point dict, the result is assigned to
    the module-level ``UNIHANDB`` and pickled to ``UNIHANDB_FILE``.
    """
    global UNIHANDB
    import pickle

    last_modified_time = _get_last_modified_time()
    try:
        current_time = UNIHANDB[""]
    except (NameError, KeyError):
        # No database loaded yet, or it carries no timestamp: rebuild it.
        current_time = None
    if current_time is not None and current_time >= last_modified_time:
        print("^_^ already the latest version")
        return
    url = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip"
    # The empty-string key holds the timestamp of the downloaded archive.
    unihandb = {"": last_modified_time}
    with _TemporaryDirectory() as tmpdir:
        path = _Path(tmpdir) / "Unihan.zip"
        _urlretrieve(url, path, reporthook=_reporthook)
        with _ZipFile(path) as zf:
            for filename in zf.namelist():
                with zf.open(filename) as f:
                    for raw_line in f:
                        # Only lines beginning with "U" (the "U+XXXX" code
                        # point column) are data; skip everything else.
                        if not raw_line.startswith(b"U"):
                            continue
                        # Strip only the line terminator, so a final line
                        # without a newline keeps its last character and
                        # CRLF input leaves no stray "\r" in the value.
                        line = raw_line.rstrip(b"\r\n").decode("utf-8")
                        # maxsplit=2 keeps embedded whitespace in the value.
                        ucn, field, value = line.split(maxsplit=2)
                        codepoint = int(ucn[2:], 16)
                        entry = unihandb.get(codepoint)
                        if entry is None:
                            entry = unihandb[codepoint] = {
                                "char": chr(codepoint),
                                "codepoint": codepoint,
                                "ucn": ucn,
                            }
                        entry[field] = value
    UNIHANDB = unihandb
    # Close the cache file deterministically so the pickle is fully written.
    with open(UNIHANDB_FILE, "wb") as cache_file:
        pickle.dump(unihandb, cache_file)


def info(char: int | str) -> dict:
    """Query the information of a Unihan (UNICODE HAN DATABASE) character.

    :Reference:
        - https://www.unicode.org/reports/tr38/
        - https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

    :param char: A unicode code point, URN (Uniform Resource Name) or single character of Unihan.

    :return: Information related to the Unihan character.
    """
    if isinstance(char, str):
        if char.startswith("U+"):
            codepoint = int(char[2:], 16)
        else:
            codepoint = ord(char)
    else:
        codepoint = char
    try:
        return dict(UNIHANDB[codepoint])
    except KeyError as e:
        raise ValueError(f"not a unihan: {char!r}") from e


import pickle as _pickle

try:
    # The local Unihan database
    # NOTE(security): unpickling can execute arbitrary code. This is only
    # acceptable because the cache file is written locally by updatedb();
    # never point UNIHANDB_FILE at data from an untrusted source.
    with open(UNIHANDB_FILE, "rb") as _cache_file:
        UNIHANDB = _pickle.load(_cache_file)
except (FileNotFoundError, EOFError, _pickle.UnpicklingError):
    # Missing, truncated or corrupt cache: rebuild it from the upstream
    # archive instead of leaving the module unimportable.
    updatedb()
	#!/usr/bin/env python3
	# coding: utf-8

	"""Unihan (UNICODE HAN DATABASE) Character Information Query Tool

	Latest version:
	- https://www.unicode.org/reports/tr38/
	Unihan source:
	- https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
	- https://www.unicode.org/Public/UCD/latest/ucd/
	Other recommended projects:
	- https://pypi.org/project/cihai/
	- https://libunihan.sourceforge.net
	"""

	# Module metadata. ``__all__`` restricts what a star-import of this module
	# exports to the database object and the two public functions.
	__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
	__version__ = (0, 0, 1)
	__all__ = ["UNIHANDB", "info", "updatedb"]

	from pathlib import Path as _Path
	from tempfile import TemporaryDirectory as _TemporaryDirectory
	from urllib.request import urlopen as _urlopen, urlretrieve as _urlretrieve
	from zipfile import ZipFile as _ZipFile


	# Store the pickled database next to this module. When ``__file__`` is
	# not defined for the running code, use a path relative to the current
	# working directory instead.
	try:
	    UNIHANDB_FILE = str(_Path(__file__).with_name("unihan.pkl"))
	except NameError:
	    UNIHANDB_FILE = "unihan.pkl"


	def _get_last_modified_time():
	    """Return the timestamp text shown for Unihan.zip in the UCD listing.

	    Downloads the directory listing page and returns the 16 ASCII
	    characters that follow the table cell linking to ``Unihan.zip``.
	    NOTE(review): presumably of the form ``YYYY-MM-DD HH:MM`` so that
	    plain string comparison orders timestamps chronologically -- confirm
	    against the live listing.

	    :return: The 16-character timestamp string taken from the listing.

	    :raise ValueError: If the expected markup is not found in the page.
	    """
	    url = "https://www.unicode.org/Public/UCD/latest/ucd/"
	    # Use the response as a context manager so the connection is closed
	    # deterministically instead of being left to the garbage collector.
	    with _urlopen(url) as response:
	        content = response.read()
	    find_text = b'>Unihan.zip</a></td><td align="right">'
	    start = content.find(find_text)
	    if start < 0:
	        raise ValueError(
	            "cannot locate the Unihan.zip entry in the UCD directory listing"
	        )
	    idx = start + len(find_text)
	    return content[idx:idx+16].decode("ascii")


	def _reporthook(blocks, block_size, total_size):
	    """Print a one-line download progress indicator.

	    Has the three-argument signature of the ``reporthook`` callback
	    accepted by ``urllib.request.urlretrieve``.

	    :param blocks: Number of blocks transferred so far.
	    :param block_size: Size of a block in bytes.
	    :param total_size: Total size of the download in bytes.
	    """
	    downloaded_size = blocks * block_size
	    if downloaded_size >= total_size:
	        # Done (this branch is also taken when the reported total is not
	        # positive): return to column 0 and erase the progress line.
	        print("\r\x1b[K", end="", flush=True)
	    else:
	        pct = downloaded_size * 100 / total_size
	        # The text has no trailing newline, so it must be flushed
	        # explicitly or a line-buffered terminal would not show it.
	        print(
	            f'\rdownloading | {downloaded_size} of {total_size} | {pct:.6f}%',
	            end="",
	            flush=True,
	        )


	def updatedb():
	    """Update the local Unihan database.

	    Data Source: https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

	    The remote timestamp of Unihan.zip is compared with the one stored
	    under the ``""`` key of ``UNIHANDB``. If the loaded database is at
	    least as recent, a message is printed and nothing else happens.
	    Otherwise the archive is downloaded to a temporary directory, every
	    data line of every member is parsed into a per-code-point dict, the
	    result is assigned to the module-level ``UNIHANDB`` and pickled to
	    ``UNIHANDB_FILE``.
	    """
	    global UNIHANDB
	    import pickle

	    last_modified_time = _get_last_modified_time()
	    try:
	        current_time = UNIHANDB[""]
	    except (NameError, KeyError):
	        # No database loaded yet, or it carries no timestamp: rebuild it.
	        current_time = None
	    if current_time is not None and current_time >= last_modified_time:
	        print("^_^ already the latest version")
	        return
	    url = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip"
	    # The empty-string key holds the timestamp of the downloaded archive.
	    unihandb = {"": last_modified_time}
	    with _TemporaryDirectory() as tmpdir:
	        path = _Path(tmpdir) / "Unihan.zip"
	        _urlretrieve(url, path, reporthook=_reporthook)
	        with _ZipFile(path) as zf:
	            for filename in zf.namelist():
	                with zf.open(filename) as f:
	                    for raw_line in f:
	                        # Only lines beginning with "U" (the "U+XXXX"
	                        # code point column) are data; skip the rest.
	                        if not raw_line.startswith(b"U"):
	                            continue
	                        # Strip only the line terminator, so a final line
	                        # without a newline keeps its last character.
	                        line = raw_line.rstrip(b"\r\n").decode("utf-8")
	                        # maxsplit=2 keeps embedded whitespace in value.
	                        ucn, field, value = line.split(maxsplit=2)
	                        codepoint = int(ucn[2:], 16)
	                        entry = unihandb.get(codepoint)
	                        if entry is None:
	                            entry = unihandb[codepoint] = {
	                                "char": chr(codepoint),
	                                "codepoint": codepoint,
	                                "ucn": ucn,
	                            }
	                        entry[field] = value
	    UNIHANDB = unihandb
	    # Close the cache file deterministically so the pickle is fully
	    # written.
	    with open(UNIHANDB_FILE, "wb") as cache_file:
	        pickle.dump(unihandb, cache_file)


	def info(char: int \| str) -> dict:
	"""Query the information of a Unihan (UNICODE HAN DATABASE) character.

	:Reference:
	- https://www.unicode.org/reports/tr38/
	- https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

	:param char: A unicode code point, URN (Uniform Resource Name) or single character of Unihan.

	:return: Information related to the Unihan character.
	"""
	if isinstance(char, str):
	if char.startswith("U+"):
	codepoint = int(char[2:], 16)
	else:
	codepoint = ord(char)
	else:
	codepoint = char
	try:
	return dict(UNIHANDB[codepoint])
	except KeyError as e:
	raise ValueError(f"not a unihan: {char!r}") from e


	import pickle as _pickle

	try:
	    # The local Unihan database
	    # NOTE(security): unpickling can execute arbitrary code. This is only
	    # acceptable because the cache file is written locally by updatedb();
	    # never point UNIHANDB_FILE at data from an untrusted source.
	    with open(UNIHANDB_FILE, "rb") as _cache_file:
	        UNIHANDB = _pickle.load(_cache_file)
	except (FileNotFoundError, EOFError, _pickle.UnpicklingError):
	    # Missing, truncated or corrupt cache: rebuild it from the upstream
	    # archive instead of leaving the module unimportable.
	    updatedb()