Created
April 16, 2020 09:19
-
-
Save imcomking/085ce7e2088501da8df3b16c4778cb39 to your computer and use it in GitHub Desktop.
Korean sub-character (jamo) level Unicode parsing in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Compatibility-jamo lookup tables. Each table is indexed by the position
# obtained from the arithmetic decomposition of a precomposed Hangul syllable.
cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"  # 19 initial consonants
jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"  # 21 vowels
# 27 final consonants; clusters are written as their two component jamo.
jong = [
    "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ",
    "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ",
    "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
]
# All three tables flattened into one string.
test = "".join([cho, jung, *jong])
# Number of index slots taken by the three tables: 19 + 21 + 27 = 67.
hangul_length = sum(len(table) for table in (cho, jung, jong))
def is_valid_decomposition_atom(x):
    """Return True if *x* is exactly one entry of the jamo tables.

    A valid atom is a single initial consonant from ``cho``, a single vowel
    from ``jung``, or one entry of ``jong`` (which includes the two-jamo
    final clusters such as 'ㄱㅅ').

    Membership is tested per table entry rather than as a substring of the
    concatenated tables, so the empty string and runs that merely straddle
    neighbouring entries (for example 'ㅎㅏ') are rejected.

    Args:
        x: Candidate atom. Non-string values are never valid.

    Returns:
        bool: whether *x* is one table entry.
    """
    if not isinstance(x, str):
        return False
    if len(x) == 1:
        # On a one-character string, ``in`` against a str is an exact
        # character lookup, so no substring ambiguity remains.
        return x in cho or x in jung or x in jong
    # Longer (or empty) strings can only match a whole final-cluster entry.
    return x in jong
def decompose(x):
    """Split one code point into its compatibility jamo.

    Args:
        x: Integer Unicode code point, e.g. the result of ``ord``.

    Returns:
        str: For a precomposed Hangul syllable ('가'..'힣'), the initial
        consonant from ``cho``, the vowel from ``jung`` and, when present,
        the final consonant entry from ``jong``, concatenated. Any other
        code point is returned unchanged as a one-character string.
    """
    if x < ord('가') or x > ord('힣'):
        return chr(x)
    # Syllables are laid out as (initial * 21 + vowel) * 28 + final.
    initial_vowel, final = divmod(x - ord('가'), 28)
    initial, vowel = divmod(initial_vowel, 21)
    # final == 0 means "no final consonant", so ``jong`` is addressed 1-based.
    tail = jong[final - 1] if final > 0 else ''
    # Inside the range check above, initial is at most 18, so indexing
    # ``cho`` cannot go out of bounds.
    return cho[initial] + jung[vowel] + tail
def decompose_as_one_hot(in_char, warning=True):
    """Map one code point to a list of feature indices.

    Despite the name, the result is a list of integer indices (the positions
    that would be set in a one-hot/multi-hot vector), not the vector itself.

    Index layout (251 slots, 0..250):
        0..18     initial consonant of a Hangul syllable
        19..39    vowel of a Hangul syllable
        40..66    final consonant of a Hangul syllable
        67..194   ASCII code points 0..127
        195..245  standalone compatibility jamo 'ㄱ'..'ㅣ'
        246..249  the symbols ♡ ♥ ★ ☆, in that order
        250       any other character

    Args:
        in_char: Integer Unicode code point, e.g. the result of ``ord``.
            Values below 0 are not validated and fall into the ASCII branch.
        warning: When true, print a message for characters that land in the
            "any other character" slot.

    Returns:
        list[int]: Two or three indices for a precomposed Hangul syllable
        (the third only when a final consonant exists); exactly one index
        for everything else.
    """
    first_syllable = ord('가')  # 44032
    if first_syllable <= in_char <= ord('힣'):  # 힣: 55203
        # Syllables are laid out as (initial * 21 + vowel) * 28 + final.
        initial_vowel, final = divmod(in_char - first_syllable, 28)
        initial, vowel = divmod(initial_vowel, 21)
        indices = [initial, len(cho) + vowel]
        # final == 0 means "no final consonant"; finals are numbered from 1.
        if final > 0:
            indices.append(len(cho) + len(jung) + (final - 1))
        return indices

    first_jamo = ord('ㄱ')  # 12593
    last_jamo = ord('ㅣ')  # 12643
    ascii_base = hangul_length  # 67
    jamo_base = ascii_base + 128  # 195
    symbol_base = jamo_base + (last_jamo - first_jamo + 1)  # 195 + 51 = 246
    symbol_slots = {
        ord(symbol): symbol_base + position
        for position, symbol in enumerate('♡♥★☆')
    }
    unknown_slot = symbol_base + len(symbol_slots)  # 250

    if in_char < 128:
        return [ascii_base + in_char]
    if first_jamo <= in_char <= last_jamo:
        return [jamo_base + (in_char - first_jamo)]
    slot = symbol_slots.get(in_char)
    if slot is not None:
        return [slot]
    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    return [unknown_slot]
def decompose_str(str):
    """Decompose every character of *str* and concatenate the results.

    Each character is converted to its code point and passed through
    ``decompose``; the pieces are joined in the original order.
    """
    # NOTE: the parameter name shadows the built-in ``str``; it is kept
    # because callers may pass it by keyword.
    code_points = map(ord, str)
    return ''.join(map(decompose, code_points))
def decompose_str_as_one_hot(str, warning=True):
    """Encode every character of *str* and return one flat list of indices.

    Each character is converted to its code point and passed to
    ``decompose_as_one_hot`` with the given ``warning`` flag; the per-character
    index lists are concatenated in the original order.
    """
    return [
        index
        for character in str
        for index in decompose_as_one_hot(ord(character), warning=warning)
    ]
if __name__ == '__main__':
    # Smoke test: print the index encoding of a few sample sentences.
    samples = (
        '개인적으로 2가 제대로라고 생각하지만',
        'SF계의 최고의 수작. 다시봐도 매우 ',
        '개봉당시 최고의 재미를 선사했던',
    )
    for sample in samples:
        print(decompose_str_as_one_hot(sample))
    # Additional manual checks, left disabled:
    # print(decompose_str('각 맑은 하늘 고운 마음 밟'))
    # print(decompose_str_as_one_hot('갛'))
    # print(decompose_as_one_hot(0))
    # print(decompose_as_one_hot(127))
    # print(decompose_str_as_one_hot('ㄱㄺㅎㅏㅣ'))
    # print(decompose_str_as_one_hot('♡♥★☆'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment