Last active
December 11, 2025 14:02
-
-
Save harjitmoe/785d9a589df0d611acd50d9f2f936a6f to your computer and use it in GitHub Desktop.
Uses GlyphWiki data to determine which SVSes and IVSes for a codepoint refer to the same glyph.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- mode: python; coding: utf-8 -*- | |
| __doc__ = """\ | |
| Uses GlyphWiki data to determine which SVSes and IVSes for a codepoint refer to the same glyph. | |
| Usage: glyphwiki_aliased_ivses.py < dump_newest_only.txt > duplicate_ivses.txt | |
| GlyphWiki data download instructions: | |
| https://en.glyphwiki.org/wiki/GlyphWiki:AdvancedApplication#i0 | |
| https://en.glyphwiki.org/wiki/GlyphWiki:高度な活用方法#i0 | |
| GlyphWiki data download: https://glyphwiki.org/dump.tar.gz | |
| """ | |
| __author__ = __copyright__ = __copying__ = """\ | |
| By HarJIT in 2025. | |
| This Source Code Form is subject to the terms of the Mozilla Public | |
| License, v. 2.0. If a copy of the MPL was not distributed with this | |
| file, You can obtain one at https://mozilla.org/MPL/2.0/. | |
| """ | |
| import hashlib, sys | |
# Data-column prefix that this script treats as "this glyph is an alias of
# another glyph" when the remainder of the column contains no further colon.
# NOTE(review): presumably GlyphWiki's encoding of a single full-size reference
# to another glyph -- confirm against the GlyphWiki data format documentation.
_ALIAS_PREFIX = "99:0:0:0:0:200:200:"

# These sequences sort after every other sequence in their group, so they are
# never picked as the canonical form that the other aliases point to.
ambiguous_svses = (
    "U+8612+FE00",  # JIS X 0213 glyph versus GBK 1.0 glyph
    "U+9F9C+FE00")  # KS X 1001 glyph versus HKSCS glyph


def _base_codepoint(name):
    """Return the integer value of the leading ``uXXXX`` component of *name*."""
    return int(name.split("-", 1)[0].removeprefix("u"), 16)


def _to_sequence_notation(name):
    """Convert a glyph name such as ``u8612-ufe00`` to ``U+8612+FE00``."""
    return name.replace("-u", "+").replace("u", "U+").upper()


def _preference_key(sequence):
    """Sort key: unambiguous sequences first, then by selector value, then text."""
    # NOTE(review): a name with extra components after the selector (anything
    # that leaves a non-hex final "+" segment) makes int() raise ValueError
    # here -- confirm no such names occur in the dump.
    selector = int(sequence.split("+")[-1], 16)
    return (sequence in ambiguous_svses, selector, sequence)


def collect_alias_sets(lines):
    """Group variation-sequence glyph names by the glyph they resolve to.

    *lines* is an iterable of text lines with three ``|``-separated columns
    (name, related character, data).  Lines starting with ``(``, blank lines,
    lines made only of ``+``/``-`` characters, and lines with fewer than three
    columns are skipped.  Only names that start with ``u`` and contain
    ``-ue01`` or ``-ufe0`` are considered.

    Returns a dict mapping ``(base, target)`` to the set of names sharing it,
    where *base* is the first ``-``-separated component of the name and
    *target* is the alias target, or the name itself when the data is not an
    alias.
    """
    groups = {}
    for line in lines:
        if line.startswith("(") or not line.strip().strip("+-"):
            continue
        # maxsplit keeps a stray "|" inside the data column from breaking the
        # three-way unpacking.
        columns = [column.strip() for column in line.split("|", 2)]
        if len(columns) != 3:
            continue
        name, _related, data = columns
        if not (name.startswith("u") and ("-ue01" in name or "-ufe0" in name)):
            continue
        remainder = data.removeprefix(_ALIAS_PREFIX)
        if data.startswith(_ALIAS_PREFIX) and ":" not in remainder:
            target_name = remainder.split("$", 1)[0]
        else:
            target_name = name
        # The (base, target) pair itself is the key: it is exact (no hash
        # collisions) and needs no int.from_bytes() call, whose byteorder
        # argument only became optional in Python 3.11.
        key = (name.split("-", 1)[0], target_name)
        groups.setdefault(key, set()).add(name)
    return groups


def format_alias_lines(alias_sets):
    """Yield one ``ALIAS → CANONICAL`` line per redundant sequence.

    *alias_sets* is an iterable of sets of glyph names; sets with a single
    member are ignored.  Groups are ordered by base code point and then by
    their smallest name.  Within a group the canonical sequence is the first
    one under ``_preference_key``.
    """
    duplicated = [names for names in alias_sets if len(names) > 1]
    duplicated.sort(key=lambda names: (_base_codepoint(min(names)), min(names)))
    for names in duplicated:
        sequences = sorted(map(_to_sequence_notation, names), key=_preference_key)
        canonical = sequences[0]
        for sequence in sequences[1:]:
            yield f"{sequence} → {canonical}"


def main():
    """Read a GlyphWiki dump from stdin and print the aliased sequences."""
    if sys.stdin.isatty() or sys.argv[1:]:
        print(__doc__, file=sys.stderr)
        sys.exit(1)
    # Force UTF-8 so that printing "→" to a redirected stdout does not depend
    # on the locale encoding.
    for stream in (sys.stdin, sys.stdout):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")
    for output_line in format_alias_lines(collect_alias_sets(sys.stdin).values()):
        print(output_line)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment