Skip to content

Instantly share code, notes, and snippets.

@woodongk
Last active July 10, 2022 07:35
Show Gist options
  • Save woodongk/7251c9559453127e9d0bf27965dfd198 to your computer and use it in GitHub Desktop.
Korean-Text-Preprocessing in Python
import re
from konlpy.tag import Mecab
from khaiii import KhaiiiApi
def remove_brackets(string, left_paren_type, right_paren_type):
    """Remove bracket pairs (and their contents) from a string.

    Repeatedly deletes the first ``left_paren_type ... right_paren_type``
    span until no complete pair remains.  The string is stripped of
    surrounding whitespace after each removal, matching the original
    behavior.

    Args:
        string: Text to clean.
        left_paren_type: Opening bracket character, e.g. '[' or '('.
        right_paren_type: Closing bracket character, e.g. ']' or ')'.

    Returns:
        The string with all complete bracketed spans removed.

    Example:
        >>> remove_brackets('[abababab] kdk[sbsbsb]dkdk', '[', ']')
        'kdkdkdk'
    """
    while left_paren_type in string:
        start = string.find(left_paren_type)
        end = string.find(right_paren_type, start + 1)
        # Fix: the original looped forever when a left bracket had no
        # matching right bracket (the loop body was skipped but the
        # string never changed).  Bail out instead.
        if end == -1:
            break
        # Remove only this span.  Slicing instead of str.replace avoids
        # accidentally deleting identical substrings elsewhere.
        string = (string[:start] + string[end + 1:]).strip()
    return string
def clean_text(sentence):
    """Return *sentence* with bracketed spans, special characters and
    redundant whitespace removed.

    Example:
        >>> s = '박세용 기자(psy05@sbs.co.kr)☞ [SBS 2017 대선] 나는 이런 대통령을 원한다!☞'
        >>> clean_text(s)
        '박세용 기자 나는 이런 대통령을 원한다'
    """
    # Strip bracket pairs together with their contents.
    sentence = remove_brackets(sentence, '[', ']')
    sentence = remove_brackets(sentence, '(', ')')
    sentence = remove_brackets(sentence, '{', '}')
    # Replace punctuation / special symbols with spaces.
    # Fix: use a raw string so escapes such as \? and \< are not invalid
    # string escapes (SyntaxWarning since Python 3.12); the regex character
    # class itself is unchanged.
    sent_clean = re.sub(r'[-=+,#/\?:“”^$*"※~&%ㆍ☞!』\‘|\(\)\[\]\<\>`\'…》]', ' ', sentence)
    # Collapse runs of spaces, then any remaining whitespace, and trim.
    sent_clean = re.sub(' +', ' ', sent_clean)
    sent_clean = " ".join(sent_clean.split())
    sent_clean = sent_clean.strip()
    return sent_clean
def tokenize_space(sentence):
    """Clean *sentence* (via ``clean_text``) and split it into
    whitespace-separated tokens, dropping empty strings.

    Returns:
        list[str]: Non-empty tokens in original order.
    """
    # Fix: the original split an undefined name ``sent_clean`` (NameError
    # on every call); split the cleaned sentence instead.
    cleaned = clean_text(sentence)
    return [tok for tok in cleaned.split(' ') if len(tok) > 0]
def tokenize_mecab(sentence):
    """POS-tag *sentence* with Mecab and keep content words only.

    Nouns and interjections (NNP/NNG/IC) are kept as-is; verb and
    adjective stems (VV/VA) get the suffix '다' appended.  Tokens of
    length 1 are dropped from the result.
    """
    tagger = Mecab()
    keep_as_is = ('NNP', 'NNG', 'IC')
    verbal = ('VV', 'VA')
    tokens = []
    for morph, pos in tagger.pos(sentence):
        if pos in keep_as_is:
            tokens.append(morph)
        elif pos in verbal:
            tokens.append(morph + '다')
    # Drop single-character tokens.
    return [t for t in tokens if len(t) > 1]
def tokenize_khaiii(sentence):
    """Morpheme-tokenize *sentence* with the Khaiii analyzer.

    Returns the lexical form of every morpheme in order; the POS tags
    produced by the analyzer are collected but discarded.
    """
    api = KhaiiiApi()
    pairs = []
    for word in api.analyze(sentence):
        pairs.extend((m.lex, m.tag) for m in word.morphs)
    return [lex for lex, _tag in pairs]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment