Skip to content

Instantly share code, notes, and snippets.

@wareya
Last active March 13, 2018 00:04
Show Gist options
  • Save wareya/abce9ca092c516493aa36f774f2180b5 to your computer and use it in GitHub Desktop.
Save wareya/abce9ca092c516493aa36f774f2180b5 to your computer and use it in GitHub Desktop.
search for han characters based on a list of components
#!python
# coding=utf-8
# Get ids.txt from https://github.com/cjkvi/cjkvi-ids/ and place it next to this script
# ~requires python 3.6 or newer on windows~
# Note: results depend on the accuracy of ids.txt; for some characters, such as 祭, its decomposition data is quite poor.
# see also: http://www.chise.org/ids-find
# Component graph populated by the script entry point from ids.txt.
# Each key is a single character and each value is a set of characters:
#   * default (compose) mode: component -> characters whose IDS mentions it
#   * --reverse (decompose) mode: character -> components listed in its IDS
contains = {}
def is_descriptor(c):
    """Return True if *c* is an Ideographic Description Character.

    These are the layout operators used inside an Ideographic Description
    Sequence (for example U+2FF0 "left to right"); they describe structure
    and are not components themselves.

    Covers the original block U+2FF0..U+2FFB plus the operators added in
    Unicode 15.1: U+2FFC..U+2FFF and U+31EF (subtraction).

    Args:
        c: A single-character string.

    Returns:
        bool: whether the character is a description operator.
    """
    code = ord(c)
    return 0x2FF0 <= code <= 0x2FFF or code == 0x31EF
import argparse, sys, re
def find_recursive(c, first=True):
    """Collect every character linked to *c* in the ``contains`` graph.

    Args:
        c: The character to start from.
        first: True for the character the caller asked about; it is then left
            out of the result unless it has no entry in ``contains`` (or is
            reached again through a cycle in the data). With False the
            starting character itself is part of the result.

    Returns:
        A new set of characters. A character without an entry in ``contains``
        yields ``{c}``. When the ``--norecurse`` option is active only the
        direct neighbours of the starting character are returned.
    """
    # ``args`` is only bound when the file runs as a script; fall back to
    # full recursion when the function is used from an importing module.
    try:
        norecurse = args.norecurse
    except NameError:
        norecurse = False

    if c not in contains or (norecurse and not first):
        return {c}

    found = set() if first else {c}
    # Iterative walk with a visited set: terminates on cyclic data and visits
    # each character once instead of once per path leading to it.
    pending = list(contains[c])
    while pending:
        node = pending.pop()
        if node in found:
            continue
        found.add(node)
        if not norecurse:
            pending.extend(contains.get(node, ()))
    return found
def find_string(string):
    """Return the characters related to every character of *string*.

    Leading and trailing whitespace is stripped, ``find_recursive`` is called
    for each remaining character, and the resulting sets are intersected.

    Args:
        string: The characters to look up, e.g. a list of components.

    Returns:
        A new set with the characters common to all per-character results;
        an empty set when *string* is empty or only whitespace.
    """
    components = string.strip()
    if not components:
        # Nothing to intersect: indexing the first result of an empty list
        # would raise IndexError.
        return set()
    candidate_sets = [find_recursive(ch) for ch in components]
    return set.intersection(*candidate_sets)
def force_print(string):
    """Write *string* to standard output as UTF-8 encoded bytes.

    The text is encoded explicitly and written to the binary buffer behind
    ``sys.stdout``, so the text layer's own encoding is not involved.
    No newline is appended.
    """
    payload = string.encode("utf-8")
    sys.stdout.buffer.write(payload)
if __name__ == "__main__":
    # Script entry point: parse the options, build the ``contains`` graph from
    # ids.txt, then print the matching characters sorted, one per line.
    import os  # local import: only needed to locate ids.txt

    parser = argparse.ArgumentParser(description="Ideographic Description Sequence tool")
    parser.add_argument("lookup_char", help="String of components to find in characters.")
    parser.add_argument("-r", "--reverse", dest="reverse", action="store_true", help="Decompose instead of compose.")
    parser.add_argument("-n", "--norecurse", dest="norecurse", action="store_true", help="First level of recursion only.")
    # ``args`` stays a module-level name because find_recursive reads it.
    args = parser.parse_args()

    if not args.lookup_char.strip():
        parser.error("lookup_char must contain at least one character")

    # Keep the working-directory lookup, then fall back to the directory of
    # this script, where the header comment says ids.txt should be placed.
    ids_path = "ids.txt"
    if not os.path.exists(ids_path):
        ids_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ids.txt")

    try:
        ids_file = open(ids_path, encoding="utf-8")
    except OSError as err:
        sys.exit("cannot open {}: {}".format(ids_path, err))

    # Bracketed annotations such as "[GTJK]" are removed before parsing.
    bracket_tag = re.compile(r"\[[^\]]*\]")
    with ids_file:
        for line in ids_file:
            fields = bracket_tag.sub("", line).split("\t")
            if len(fields) < 3:
                # Comment lines and malformed entries carry no decomposition.
                continue
            char = fields[1].strip()
            # fields 2+ are each particular decompositions
            ids = "".join(fields[2:]).strip()
            for c in ids:
                # Skip layout operators and self-references.
                if is_descriptor(c) or c == char:
                    continue
                if args.reverse:
                    contains.setdefault(char, set()).add(c)
                else:
                    contains.setdefault(c, set()).add(char)

    force_print("\n".join(sorted(find_string(args.lookup_char))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment