Skip to content

Instantly share code, notes, and snippets.

@takana-v
Created February 9, 2022 11:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takana-v/3e639a59645c03e21b82de913d2c7a00 to your computer and use it in GitHub Desktop.
Save takana-v/3e639a59645c03e21b82de913d2c7a00 to your computer and use it in GitHub Desktop.
import unicodedata
# Half-width printable ASCII, U+0021 ('!') through U+007E ('~'): 94 characters.
hankaku_alphabet = "".join(map(chr, range(0x21, 0x7F)))
# Full-width forms at the same offsets, U+FF01 through U+FF5E.
zenkaku_alphabet = "".join(map(chr, range(0xFF01, 0xFF5F)))
# Code-point mapping for str.translate(): half-width -> full-width.
translate_table = str.maketrans(hankaku_alphabet, zenkaku_alphabet)

# Source dictionary CSV and the destination for the normalized copy.
original_dict = "/path/to/naist-jdic.csv"
normalized_dict = "/path/to/normalized.csv"
def text_normalize(text: str) -> str:
    """Return *text* with ASCII widened to full-width and lowercased.

    Every character found in ``translate_table`` (printable half-width ASCII)
    is first replaced by its full-width counterpart.  Each character of the
    widened string is then handled on its own:

    * characters contained in ``zenkaku_alphabet`` are lowercased;
    * every other character is passed through
      ``unicodedata.normalize("NFC", ...)``.

    NFC is applied one character at a time, not to the string as a whole, so
    adjacent characters are never combined with each other.
    """
    widened = text.translate(translate_table)
    # One join over a generator instead of repeated ``ret += ...``, which can
    # degrade to quadratic time as the string grows.
    return "".join(
        ch.lower() if ch in zenkaku_alphabet else unicodedata.normalize("NFC", ch)
        for ch in widened
    )
# Keys of the rows already written; a key is the row with column 10 left out.
write_log = set()
with open(original_dict, encoding="utf-8") as src, \
        open(normalized_dict, mode="w", encoding="utf-8") as dst:
    for row in src:
        columns = row.split(",")
        # Only the first column is normalized; the rest pass through as-is.
        columns[0] = text_normalize(columns[0])
        # Rows that differ only in column 10 count as duplicates.
        dedup_key = ",".join(
            col for idx, col in enumerate(columns) if idx != 10
        )
        if dedup_key not in write_log:
            write_log.add(dedup_key)
            # The line's own trailing newline is still in the last column.
            dst.write(",".join(columns))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment