Last active
July 10, 2022 07:35
-
-
Save woodongk/7251c9559453127e9d0bf27965dfd198 to your computer and use it in GitHub Desktop.
Korean-Text-Preprocessing in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from konlpy.tag import Mecab | |
from khaiii import KhaiiiApi | |
def remove_brackets(string, left_paren_type, right_paren_type):
    '''Remove bracket pairs and their contents from a string.

    Args:
        string: input text.
        left_paren_type: opening bracket character, e.g. '[' or '('.
        right_paren_type: matching closing bracket character, e.g. ']' or ')'.

    Returns:
        The string with every "left ... right" span deleted and the
        surrounding whitespace stripped.

    Example:
        >>> s = '[abababab] kdk[sbsbsb]dkdk'
        >>> remove_brackets(s, '[', ']')
        'kdkdkdk'
    '''
    # Delete one bracketed span per iteration until no opening bracket remains.
    while left_paren_type in string:
        start = string.index(left_paren_type)
        # Look for a closing bracket *after* the opening one. Bail out on an
        # unmatched opening bracket — the original version looped forever in
        # that case, because its guard only skipped the loop body.
        end = string.find(right_paren_type, start + 1)
        if end == -1:
            break
        # Cut exactly this span by position. The original used str.replace,
        # which also deleted identical text occurring outside any brackets.
        string = (string[:start] + string[end + 1:]).strip()
    return string
def clean_text(sentence):
    '''Normalize a sentence: drop bracketed spans, special characters,
    and redundant whitespace.

    Example:
        >>> s = '박세용 기자(psy05@sbs.co.kr)☞ [SBS 2017 대선] 나는 이런 대통령을 원한다!☞'
        >>> clean_text(s)
        '박세용 기자 나는 이런 대통령을 원한다'
    '''
    # Strip every kind of bracketed span together with its contents.
    for opener, closer in (('[', ']'), ('(', ')'), ('{', '}')):
        sentence = remove_brackets(sentence, opener, closer)
    # Replace special/punctuation characters with spaces.
    cleaned = re.sub('[-=+,#/\?:“”^$*\"※~&%ㆍ☞!』\\‘|\(\)\[\]\<\>`\'…》]', ' ', sentence)
    # Collapse runs of spaces, then normalize all remaining whitespace.
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = " ".join(cleaned.split())
    return cleaned.strip()
def tokenize_space(sentence):
    '''Clean a sentence and tokenize it on single spaces.

    Returns:
        A list of non-empty whitespace-delimited tokens from the cleaned
        sentence.

    Example:
        >>> s = '1987 본 문 대통령.."그런다고 바뀌나? 함께 하면 바뀐다"'
        >>> tokenize_space(s)
        ['1987', '본', '문', '대통령', '그런다고', '바뀌나', '함께', '하면', '바뀐다']
    '''
    cleaned = clean_text(sentence)
    # Bug fix: the original split the undefined name `sent_clean` (a leftover
    # from clean_text), which raised NameError on every call; split the
    # cleaned sentence instead.
    return [token for token in cleaned.split(' ') if len(token) > 0]
def tokenize_mecab(sentence):
    """Tokenize a sentence with Mecab POS tagging, keeping content words.

    Morphemes tagged NNP/NNG/IC are kept as-is; verb/adjective stems
    (VV/VA) get the suffix '다' appended to restore their dictionary form.
    Tokens of length 1 are discarded at the end.
    """
    tagger = Mecab()
    keep_tags = ('NNP', 'NNG', 'IC')
    stem_tags = ('VV', 'VA')
    tokens = []
    for morpheme, pos in tagger.pos(sentence):
        if pos in keep_tags:
            tokens.append(morpheme)
        elif pos in stem_tags:
            tokens.append(morpheme + '다')
    # Drop single-character tokens.
    return [token for token in tokens if len(token) > 1]
def tokenize_khaiii(sentence):
    """Tokenize a sentence into morphemes using the Khaiii analyzer.

    Returns every morpheme surface form in order of appearance; POS tags
    are collected alongside but not used for filtering.
    """
    analyzer = KhaiiiApi()
    tagged = []
    for word in analyzer.analyze(sentence):
        tagged.extend((morph.lex, morph.tag) for morph in word.morphs)
    return [lex for lex, _tag in tagged]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment