Last active
July 8, 2024 20:54
-
-
Save RussellSprouts/6c5fbdf6689b8dc3f99406d28a9f7884 to your computer and use it in GitHub Desktop.
Finding the syllable block with the most strokes in real Korean words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To use, `sudo apt install hunspell-ko`.
import unicodedata

# Strokes needed to write each jamo, keyed by the last word of the jamo's
# Unicode character name. Compound final consonants have hyphenated names
# (e.g. RIEUL-KIYEOK) and are scored as the sum of their parts.
stroke_counts = {
    'KIYEOK': 1,
    'SSANGKIYEOK': 2,
    'NIEUN': 1,
    'TIKEUT': 2,
    'SSANGTIKEUT': 4,
    'RIEUL': 3,
    'MIEUM': 3,
    'PIEUP': 4,
    'SSANGPIEUP': 8,
    'SIOS': 2,
    'SSANGSIOS': 4,
    'IEUNG': 1,
    'CIEUC': 2,
    'SSANGCIEUC': 4,
    'CHIEUCH': 3,
    'SSANGCHIEUCH': 6,
    'KHIEUKH': 2,
    'THIEUTH': 3,
    'PHIEUPH': 4,
    'HIEUH': 3,
    'A': 2,
    'AE': 3,
    'YA': 3,
    'YAE': 4,
    'EO': 2,
    'E': 3,
    'YEO': 3,
    'YE': 4,
    'O': 2,
    'WA': 4,
    'WAE': 5,
    'OE': 3,
    'YO': 3,
    'U': 2,
    'WEO': 4,
    'WE': 5,
    'WI': 3,
    'YU': 3,
    'EU': 1,
    'YI': 2,
    'I': 1,
}


def syllable_strokes(char):
    """Return the stroke count of one precomposed Hangul syllable.

    Returns None for any character whose Unicode name does not contain
    'HANGUL SYLLABLE'. That includes characters with no name at all (tabs
    and other controls): the empty default name keeps them from inheriting
    the name of whatever character was inspected before them.
    """
    if 'HANGUL SYLLABLE' not in unicodedata.name(char, ''):
        return None
    total = 0
    # NFD splits the syllable block into its individual jamo.
    for jamo in unicodedata.normalize('NFD', char):
        jamo_name = unicodedata.name(jamo).split(' ')[-1]
        total += sum(stroke_counts[part] for part in jamo_name.split('-'))
    return total


def record_syllables(lines):
    """Yield (syllable, strokes, line) each time a new stroke record is set.

    Only a syllable with strictly more strokes than every syllable seen
    before it is reported; ties with the current record are skipped.
    """
    best_strokes = 0
    for line in lines:
        for char in line:
            total_strokes = syllable_strokes(char)
            if total_strokes is not None and total_strokes > best_strokes:
                yield char, total_strokes, line
                best_strokes = total_strokes


def main():
    """Scan the hunspell-ko word list and print each new stroke record."""
    # NOTE(review): UTF-8 is assumed for the dictionary; confirm against the
    # SET line of the accompanying ko.aff.
    with open("/usr/share/myspell/dicts/ko.dic", "r", encoding="utf-8") as f:
        # Ensure the text is composed so that each syllable is a single
        # character.
        normalized_text = unicodedata.normalize('NFC', f.read())
    for char, total_strokes, line in record_syllables(normalized_text.splitlines()):
        print(char, total_strokes, line)


if __name__ == "__main__":
    main()
# Results:
# To use, `sudo apt install hunspell-ko`.
import unicodedata

# Strokes needed to write each jamo, keyed by the last word of the jamo's
# Unicode character name. Compound final consonants have hyphenated names
# (e.g. RIEUL-KIYEOK) and are scored as the sum of their parts.
stroke_counts = {
    'KIYEOK': 1,
    'SSANGKIYEOK': 2,
    'NIEUN': 1,
    'TIKEUT': 2,
    'SSANGTIKEUT': 4,
    'RIEUL': 3,
    'MIEUM': 3,
    'PIEUP': 4,
    'SSANGPIEUP': 8,
    'SIOS': 2,
    'SSANGSIOS': 4,
    'IEUNG': 1,
    'CIEUC': 2,
    'SSANGCIEUC': 4,
    'CHIEUCH': 3,
    'SSANGCHIEUCH': 6,
    'KHIEUKH': 2,
    'THIEUTH': 3,
    'PHIEUPH': 4,
    'HIEUH': 3,
    'A': 2,
    'AE': 3,
    'YA': 3,
    'YAE': 4,
    'EO': 2,
    'E': 3,
    'YEO': 3,
    'YE': 4,
    'O': 2,
    'WA': 4,
    'WAE': 5,
    'OE': 3,
    'YO': 3,
    'U': 2,
    'WEO': 4,
    'WE': 5,
    'WI': 3,
    'YU': 3,
    'EU': 1,
    'YI': 2,
    'I': 1,
}


def syllable_strokes(char):
    """Return the stroke count of one precomposed Hangul syllable.

    Returns None for any character whose Unicode name does not contain
    'HANGUL SYLLABLE'. That includes characters with no name at all (tabs
    and other controls): the empty default name keeps them from inheriting
    the name of whatever character was inspected before them.
    """
    if 'HANGUL SYLLABLE' not in unicodedata.name(char, ''):
        return None
    total = 0
    # NFD splits the syllable block into its individual jamo.
    for jamo in unicodedata.normalize('NFD', char):
        jamo_name = unicodedata.name(jamo).split(' ')[-1]
        total += sum(stroke_counts[part] for part in jamo_name.split('-'))
    return total


def record_syllables(lines):
    """Yield (syllable, strokes, line) for every syllable matching or beating the record.

    A syllable is reported when its stroke count is greater than or equal
    to the highest count seen so far, so every word tied for the maximum
    shows up in the output, not just the first one.
    """
    best_strokes = 0
    for line in lines:
        for char in line:
            total_strokes = syllable_strokes(char)
            if total_strokes is not None and total_strokes >= best_strokes:
                yield char, total_strokes, line
                best_strokes = total_strokes


def main():
    """Scan the hunspell-ko word list and print each record-level syllable."""
    # NOTE(review): UTF-8 is assumed for the dictionary; confirm against the
    # SET line of the accompanying ko.aff.
    with open("/usr/share/myspell/dicts/ko.dic", "r", encoding="utf-8") as f:
        # Ensure the text is composed so that each syllable is a single
        # character.
        normalized_text = unicodedata.normalize('NFC', f.read())
    for char, total_strokes, line in record_syllables(normalized_text.splitlines()):
        print(char, total_strokes, line)


if __name__ == "__main__":
    main()
# Results: | |
# 뺄 14 내뺄/4 | |
# 뽑 14 내뽑다/44 | |
# 뽑 14 내뽑아/2 | |
# 뽑 14 내뽑은/3 | |
# 뽑 14 내뽑을/4 | |
# 뺌 14 발뺌/10 | |
# 뺌 14 발뺌하다/44 | |
# 뺌 14 발뺌하여/2 | |
# 뺌 14 발뺌한/3 | |
# 뺌 14 발뺌할/4 | |
# 뺌 14 발뺌해/2 | |
# 뺄 14 뺄/4 | |
# 뺄 14 뺄셈/10 | |
# 뺨 14 뺨/10 | |
# 뺨 14 뺨따귀/25 | |
# 뺨 14 뺨쳐/2 | |
# 뺨 14 뺨치다/44 | |
# 뺨 14 뺨치어/2 | |
# 뺨 14 뺨친/3 | |
# 뺨 14 뺨칠/4 | |
# 뼘 14 뼘/10 | |
# 뽑 14 뽑다/44 | |
# 뽑 14 뽑아/2 | |
# 뽑 14 뽑아내다/44 | |
# 뽑 14 뽑은/3 | |
# 뽑 14 뽑을/4 | |
# 뽑 14 뽑혀/2 | |
# 뽑 14 뽑히다/44 | |
# 뽑 14 뽑히어/2 | |
# 뽑 14 뽑힌/3 | |
# 뽑 14 뽑힐/4 | |
# 뺄 14 용뺄/4 | |
# 뽑 14 제비뽑기/25 | |
# 뼘 14 한뼘바지/25 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment