Skip to content

Instantly share code, notes, and snippets.

@imcomking
Created April 16, 2020 09:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imcomking/085ce7e2088501da8df3b16c4778cb39 to your computer and use it in GitHub Desktop.
Save imcomking/085ce7e2088501da8df3b16c4778cb39 to your computer and use it in GitHub Desktop.
Korean sub-character (jamo) level Unicode parsing in Python
# -*- coding: utf-8 -*-
# Jamo tables used to split a precomposed Hangul syllable into its parts.
# Initial consonants: 19 entries, one character each.
cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
# Medial vowels: 21 entries, one character each.
jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
# Final consonants: 27 entries; consonant clusters are two-character strings.
jong = [
    "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ",
    "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ",
    "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
]
# All of the jamo above concatenated into a single string.
test = "".join([cho, jung] + jong)
# Number of table entries in total: 19 + 21 + 27 = 67.
hangul_length = sum(len(table) for table in (cho, jung, jong))
def is_valid_decomposition_atom(x):
    """Return True if *x* is a single entry of the ``cho``, ``jung`` or ``jong`` tables.

    Args:
        x: Candidate string: one jamo character, or a two-character final
            consonant cluster such as ``'ㄱㅅ'``.

    Returns:
        True when *x* is exactly one table entry, False otherwise. The empty
        string and arbitrary runs of jamo (for example ``'ㄱㄲ'``) are not
        atoms and yield False.
    """
    if len(x) == 1:
        # ``cho`` and ``jung`` are strings of one-character entries, so a
        # substring test is a plain membership test for a single character.
        return x in cho or x in jung or x in jong
    # ``jong`` is a list, so this compares whole entries; only the
    # multi-character final clusters can match here.
    return x in jong
def decompose(x):
    """Split one code point into its jamo components.

    Args:
        x: Integer Unicode code point.

    Returns:
        For a precomposed Hangul syllable (``'가'`` through ``'힣'``), the
        concatenation of its initial consonant from ``cho``, its medial vowel
        from ``jung`` and, when present, its final consonant from ``jong``.
        Any other code point is returned unchanged as ``chr(x)``.
    """
    first_syllable = ord('가')
    if not first_syllable <= x <= ord('힣'):
        return chr(x)

    # Offset from '가' is (initial * 21 + medial) * 28 + final.
    initial_medial, final = divmod(x - first_syllable, 28)
    initial, medial = divmod(initial_medial, 21)

    # final == 0 means "no final consonant"; jong[0] belongs to final == 1.
    tail = jong[final - 1] if final > 0 else ''

    # The range check above caps ``initial`` at 18, so cho[initial] is always
    # a valid index.
    return cho[initial] + jung[medial] + tail
def decompose_as_one_hot(in_char, warning=True):
    """Map one code point to the list of one-hot indices that encode it.

    With the 19/21/27-entry ``cho``/``jung``/``jong`` tables of this module
    (``hangul_length == 67``) the index layout is:

        [0, 18]     initial consonant of a Hangul syllable
        [19, 39]    medial vowel of a Hangul syllable
        [40, 66]    final consonant of a Hangul syllable
        [67, 194]   code points 0..127 (ASCII)
        [195, 245]  stand-alone jamo 'ㄱ'..'ㅣ' (51 code points)
        [246, 249]  '♡', '♥', '★', '☆'
        250         any other character

    The largest index is 250, so a one-hot vector needs 251 slots.

    Args:
        in_char: Integer Unicode code point.
        warning: When True, print a message for characters that fall into
            the "any other character" slot.

    Returns:
        A list of two or three indices for a Hangul syllable (initial,
        medial, and final when present), otherwise a single-element list.
    """
    first_syllable = ord('가')  # 44032
    if first_syllable <= in_char <= ord('힣'):  # 힣: 55203
        # Offset from '가' is (initial * 21 + medial) * 28 + final.
        initial_medial, final = divmod(in_char - first_syllable, 28)
        initial, medial = divmod(initial_medial, 21)
        one_hot = [initial, len(cho) + medial]
        # final == 0 means "no final consonant"; finals are stored 1-based.
        if final > 0:
            one_hot.append(len(cho) + len(jung) + (final - 1))
        return one_hot

    ascii_base = hangul_length
    if in_char < 128:
        return [ascii_base + in_char]

    first_jamo = ord('ㄱ')  # 12593
    last_jamo = ord('ㅣ')  # 12643
    jamo_base = ascii_base + 128
    if first_jamo <= in_char <= last_jamo:
        return [jamo_base + (in_char - first_jamo)]

    # The symbols take consecutive slots right after the 51 jamo slots.
    symbol_base = jamo_base + (last_jamo - first_jamo + 1)
    symbol_codes = (ord('♡'), ord('♥'), ord('★'), ord('☆'))
    if in_char in symbol_codes:
        return [symbol_base + symbol_codes.index(in_char)]

    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    # Everything else shares one "unknown" slot after the symbols.
    return [symbol_base + len(symbol_codes)]
def decompose_str(str):
    """Apply ``decompose`` to every character of *str* and join the results.

    Args:
        str: Text to decompose.

    Returns:
        One string made of the per-character ``decompose`` outputs, in order.
    """
    code_points = map(ord, str)
    return ''.join(map(decompose, code_points))
def decompose_str_as_one_hot(str, warning=True):
    """Encode *str* as a flat list of one-hot indices.

    Args:
        str: Text to encode.
        warning: Forwarded to ``decompose_as_one_hot`` for every character.

    Returns:
        The index lists produced by ``decompose_as_one_hot`` for each
        character, concatenated in character order.
    """
    return [
        index
        for char in str
        for index in decompose_as_one_hot(ord(char), warning=warning)
    ]
if __name__ == '__main__':
    # Demo: print the one-hot index list for a few sample sentences.
    samples = (
        '개인적으로 2가 제대로라고 생각하지만',
        'SF계의 최고의 수작. 다시봐도 매우 ',
        '개봉당시 최고의 재미를 선사했던',
    )
    for sample in samples:
        print(decompose_str_as_one_hot(sample))

    # Additional manual checks, left disabled:
    # print(decompose_str('각 맑은 하늘 고운 마음 밟'))
    # print(decompose_str_as_one_hot('갛'))
    # print(decompose_as_one_hot(0))
    # print(decompose_as_one_hot(127))
    # print(decompose_str_as_one_hot('ㄱㄺㅎㅏㅣ'))
    # print(decompose_str_as_one_hot('♡♥★☆'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment