Skip to content

Instantly share code, notes, and snippets.

@miroblog
Last active January 22, 2021 02:18
Show Gist options
  • Select an option

  • Save miroblog/26f56170ba963cd93b2442c3fa53f4f0 to your computer and use it in GitHub Desktop.

Select an option

Save miroblog/26f56170ba963cd93b2442c3fa53f4f0 to your computer and use it in GitHub Desktop.
at_unicode
#
# Copyright 2017 Atlas Guide (Author : Lucas Jo)
#
# Apache 2.0
#
import unicodedata
import re
def _bmp_chars(categories, name_regex=None):
    """Return the characters in U+0000..U+FFFE selected by category and name.

    A character is kept when its Unicode general category is listed in
    ``categories`` and, if ``name_regex`` is given, its Unicode name matches
    that regular expression (``re.search`` semantics).
    """
    matcher = re.compile(name_regex) if name_regex is not None else None
    chosen = []
    for codepoint in range(0xffff):
        char = chr(codepoint)
        if unicodedata.category(char) not in categories:
            continue
        # The name is looked up only after the category test has passed.
        if matcher is not None and matcher.search(unicodedata.name(char)) is None:
            continue
        chosen.append(char)
    return ''.join(chosen)


# General categories scanned when looking for quotation marks / apostrophes.
_PUNCT_CATEGORIES = ('Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps')

# Squared unit symbols U+3380..U+33DD.
measureUnits = ''.join(map(chr, range(0x3380, 0x33DD + 1)))
# Every "... PERCENT ..." sign of category Po.
percents = _bmp_chars(('Po',), 'PERCENT')
# Every currency symbol (category Sc).
currencies = _bmp_chars(('Sc',))
# Punctuation whose Unicode name contains QUOTATION / APOSTROPHE.
quatation = _bmp_chars(_PUNCT_CATEGORIES, 'QUOTATION')
apostrophe = _bmp_chars(_PUNCT_CATEGORIES, 'APOSTROPHE')

userDefines = "-~+=%/:;"
puctuations = ".,?!'"

# "Other symbol" shapes whose name contains the word TRIANGLE / CIRCLE / SQUARE;
# together they form the `separators` set.
triangles = _bmp_chars(('So',), ' TRIANGLE\\b')
circles = _bmp_chars(('So',), ' CIRCLE\\b')
squares = _bmp_chars(('So',), ' SQUARE\\b')
separators = ''.join((triangles, circles, squares))

# Symbols that are kept in addition to space, newline, Hangul, digits and
# ASCII letters.
valids = ''.join((measureUnits, percents, currencies, userDefines, puctuations))
_escaped_valids = re.escape(valids)
# Pattern source matching a run of characters outside / inside the kept set.
invalids_chars = r"[^ \n가-힣0-9a-zA-Z" + _escaped_valids + r"]+"
valids_chars = r"[ \n가-힣0-9a-zA-Z" + _escaped_valids + r"]+"

# Matches a single Han ideograph / radical from the listed ranges.
chinese = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)

# Reference table of Japanese-related ranges:
#   3000-303F : punctuation (listed for reference; not part of the pattern below)
#   3040-309F : hiragana
#   30A0-30FF : katakana
#   FF00-FFEF : full-width roman + half-width katakana
#   4E00-9FAF : common and uncommon kanji
japanese = re.compile(u'[\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9faf]', re.UNICODE)
# Candidate Korean readings for the user-defined symbols.  Only symbols with
# an entry here have readings; '=' is intentionally left disabled.
userDefines_pronun = {
    '-': ['마이너스', '에서', '다시'],
    '~': ['에서', '부터'],
    '+': ['더하기', '플러스'],
    # '=': ['는', '은', '이콜'],
    '%': ['퍼센트', '프로', '퍼센티지'],
    '/': ['나누기', '퍼', '슬래쉬'],
}
# Korean readings for the squared unit symbols U+3380..U+33DD, in code-point
# order (one entry per symbol in `measureUnits`).
measureUnits_pronun = {
    '㎀': ['피코 암페어'],
    '㎁': ['나노 암페어'],
    '㎂': ['마이크로 암페어'],
    '㎃': ['밀리 암페어'],
    '㎄': ['킬로 암페어'],
    '㎅': ['킬로 바이트'],
    '㎆': ['메가 바이트'],
    '㎇': ['기가 바이트'],
    '㎈': ['칼로리'],
    '㎉': ['킬로 칼로리'],
    '㎊': ['피코 페럿'],
    '㎋': ['나노 페럿'],
    '㎌': ['마이크로 페럿'],
    '㎍': ['마이크로 그램'],
    '㎎': ['밀리 그램'],
    '㎏': ['킬로 그램'],
    '㎐': ['헤르츠'],
    '㎑': ['킬로 헤르츠'],
    '㎒': ['메가 헤르츠'],
    '㎓': ['기가 헤르츠'],
    # U+3394 is SQUARE THZ (terahertz); it was previously read as kilohertz.
    '㎔': ['테라 헤르츠'],
    '㎕': ['마이크로 리터'],
    '㎖': ['밀리 리터'],
    '㎗': ['데시 리터'],
    '㎘': ['킬로 리터'],
    '㎙': ['펨토 미터'],
    '㎚': ['나노 미터'],
    '㎛': ['마이크로 미터'],
    '㎜': ['밀리 미터'],
    '㎝': ['센티 미터'],
    '㎞': ['킬로 미터'],
    '㎟': ['제곱 밀리 미터'],
    '㎠': ['제곱 센티 미터'],
    '㎡': ['제곱 미터'],
    '㎢': ['제곱 킬로 미터'],
    '㎣': ['세 제곱 밀리 미터'],
    '㎤': ['세 제곱 센티 미터'],
    '㎥': ['세 제곱 미터'],
    '㎦': ['세 제곱 킬로 미터'],
    '㎧': ['미터 퍼 쎄크'],
    # Spelling aligned with the other "쎄크" (second) entries.
    '㎨': ['미터 퍼 제곱 쎄크'],
    '㎩': ['파스칼'],
    '㎪': ['킬로 파스칼'],
    '㎫': ['메가 파스칼'],
    '㎬': ['기가 파스칼'],
    '㎭': ['라디안'],
    '㎮': ['라디안 퍼 쎄크'],
    '㎯': ['라디안 퍼 제곱 쎄크'],
    '㎰': ['피코 쎄크'],
    '㎱': ['나노 쎄크'],
    '㎲': ['마이크로 쎄크'],
    '㎳': ['밀리 쎄크'],
    '㎴': ['피코 볼트'],
    '㎵': ['나노 볼트'],
    '㎶': ['마이크로 볼트'],
    '㎷': ['밀리 볼트'],
    '㎸': ['킬로 볼트'],
    '㎹': ['메가 볼트'],
    '㎺': ['피코 와트'],
    '㎻': ['나노 와트'],
    '㎼': ['마이크로 와트'],
    '㎽': ['밀리 와트'],
    '㎾': ['킬로 와트'],
    '㎿': ['메가 와트'],
    '㏀': ['킬로 옴'],
    '㏁': ['메가 옴'],
    '㏂': ['오전'],
    '㏃': ['베크렐'],
    '㏄': ['씨씨'],
    '㏅': ['칸델라'],
    '㏆': ['쿨롱 퍼 킬로 그램'],
    '㏇': ['씨 오'],
    '㏈': ['데시 벨'],
    '㏉': ['그레이'],
    '㏊': ['헥타르'],
    '㏋': ['마력'],
    '㏌': ['인치'],
    '㏍': ['킬로 카이저'],
    '㏎': ['킬로 미터'],
    '㏏': ['킬로 톤'],
    '㏐': ['루멘'],
    '㏑': ['로그'],
    '㏒': ['로그'],
    '㏓': ['럭스'],
    '㏔': ['밀리 바'],
    '㏕': ['밀'],
    '㏖': ['몰'],
    '㏗': ['피 에이치'],
    '㏘': ['오후'],
    '㏙': ['피 피 엠'],
    '㏚': ['피 알'],
    '㏛': ['스테라디안'],
    '㏜': ['시버트'],
    '㏝': ['웨버'],
}
# Korean readings for currency symbols (Unicode category Sc).
currencies_pronun = {
    '$': ['달러'],
    '¢': ['센트'],
    '£': ['파운드'],
    '¤': ['화폐 표시'],
    '¥': ['엔'],
    '֏': ['드람'],
    '؋': ['아프가니'],
    '৲': ['루피 마크'],
    '৳': ['루피 싸인'],
    '৻': ['간다'],
    '૱': ['루피'],
    '௹': ['루피'],
    '฿': ['바트'],
    '៛': ['리엘'],
    '₠': ['유로'],
    '₡': ['콜론'],
    '₢': ['크루제이로'],
    '₣': ['프랑'],
    '₤': ['리라'],
    '₥': ['밀'],
    '₦': ['나이라'],
    '₧': ['페세타'],
    '₨': ['루피'],
    '₩': ['원'],
    '₪': ['세겔'],
    '₫': ['동'],
    '€': ['유로'],
    '₭': ['킵'],
    '₮': ['터그릭'],
    '₯': ['드라크마'],
    '₰': ['페니'],
    '₱': ['페소'],
    '₲': ['과라니'],
    '₳': ['오스트랄'],
    '₴': ['리브니아'],
    '₵': ['세디'],
    '₶': ['토르노'],
    '₷': ['스페스밀로'],
    '₸': ['텐지'],
    '₹': ['루피'],
    '₺': ['리라'],
    '₻': ['노르딕'],
    '₼': ['마네'],
    '₽': ['루블'],
    '₾': ['라리'],
    '꠸': ['루피'],
    '﷼': ['리알'],
    '﹩': ['달러'],
    # Full-width forms.  Written as escapes so they cannot be silently folded
    # into the ASCII/Latin-1 keys above (which made these five entries
    # duplicate keys and left the full-width signs without a reading).
    '\uff04': ['달러'],    # FULLWIDTH DOLLAR SIGN
    '\uffe0': ['센트'],    # FULLWIDTH CENT SIGN
    '\uffe1': ['파운드'],  # FULLWIDTH POUND SIGN
    '\uffe5': ['엔'],      # FULLWIDTH YEN SIGN
    '\uffe6': ['원'],      # FULLWIDTH WON SIGN
}
# TBD: additional symbols extracted from the corpus, each with its candidate
# Korean readings.  Note that the Greek letter mu and the micro sign are two
# distinct keys.
validChars = {
    '℃': ['도', '도 섭씨', '도 씨'],
    '㈜': ['주', '주식회사'],
    'ρ': ['로'],
    'μ': ['뮤', '마이크로'],
    'µ': ['마이크로', '뮤'],
    'W': ['와트'],
}
if __name__ == '__main__':
    # Debug aid: show the character-class pattern of everything that is kept.
    print(valids_chars)
import re
import sys
import at_unicode
# Largest value read with place units (16 digits: 9999,9999,9999,9999).
MAX_NUMBER = 10 ** 16 - 1

# Place units: index 0 holds the 4-digit group units, indices 1..3 the units
# inside a group (tens, hundreds, thousands).
readTextUnit = [
    ['', '만', '억', '조'],
    '십',
    '백',
    '천',
]

# Sino-Korean digit readings; index 10 is an empty placeholder.
readText = ['영', '일', '이', '삼', '사', '오', '육', '칠', '팔', '구', '']
# Digit-by-digit readings: identical except that zero is read '공'.
readNumber = ['공'] + readText[1:]

# Native-Korean tens (10, 20, ... 90), indexed by the tens digit.
readCountUnit = ['', '열', '스물', '서른', '마흔', '쉰', '예순', '일흔', '여든', '아흔']
# Native-Korean ones: row 0 is the stand-alone form, row 1 the form used in
# front of a counting unit.
readCount = [
    ['', '하나', '둘', '셋', '넷', '다섯', '여섯', '일곱', '여덟', '아홉'],
    ['', '한', '두', '세', '네', '다섯', '여섯', '일곱', '여덟', '아홉'],
]

# Counting units that switch a preceding number to the native-Korean reading.
COUNT_UNIT = [
    '배', '채', '개', '시', '말', '벌', '축', '톳', '손', '살', '죽', '쾌', '닢', '병', '건', '속', '주',
    '망', '포', '피', '미', '팩', '통', '줄', '봉', '단', '판', '모', '척', '번', '잔', '장', '쌍', '명',
    '마리', '가지', '방울', '자루', '켤레', '사람', '박스', '묶음', '보루', '봉지', '포기', '시루',
]
def number2readNumber(numbers):
    """Read a digit string one digit at a time.

    Args:
        numbers: text made only of decimal digits.

    Returns:
        The `readNumber` reading of every digit, in order, joined by single
        spaces ('' for an empty input).

    Raises:
        ValueError: if a character cannot be converted with ``int``.
    """
    return " ".join(readNumber[int(digit)] for digit in numbers)
def number2readCount(numbers, option=1):
    """Read a number below 100 with native-Korean numerals.

    Args:
        numbers: the number as a digit string.
        option: row of `readCount` used for the ones digit.
            0 - stand-alone form (no unit follows),
            1 - form used in front of a counting unit (default).

    Returns:
        A list with the tens word and/or the ones word, most significant
        first; zero digits contribute nothing, so the list may be empty.

    Exits the interpreter through ``sys.exit`` when the value is above 99.
    """
    if int(numbers) > 99:
        sys.exit('Out-of-range: read count range is 1~99')

    result = []
    # Walk from the ones digit upwards; only position 0 uses `readCount`.
    for position, digit in enumerate(reversed(numbers)):
        idxNum = int(digit)
        if position == 0:
            word = readCount[option][idxNum]
        else:
            word = readCountUnit[idxNum]
        if word:
            result.insert(0, word)
    return result
def number2readText(numbers, option1=0, option2=0):
    """Read a number with Sino-Korean numerals and place units.

    Values up to MAX_NUMBER (9999,9999,9999,9999) are read with place units;
    anything larger is read digit by digit through `number2readNumber`.

    Args:
        numbers: the number as a digit string; leading zeros are ignored.
        option1: 0 - read every digit Sino-Korean (default);
                 1 - read the part below one hundred with native-Korean
                     numerals (see `number2readCount`).
        option2: forwarded to `number2readCount` as its ``option``.

    Returns:
        The reading tokens joined by single spaces.  Tokens of every fourth
        digit carry an embedded space before their group unit (e.g. '이 만').
    """
    # Drop leading zeros but keep a lone "0".
    numbers = numbers.lstrip("0") or "0"
    if int(numbers) > MAX_NUMBER:
        return number2readNumber(numbers)

    result = []
    lowTokens = 0  # how many entries of `result` came from the last two digits
    for cnt, number in enumerate(reversed(numbers)):
        prec = cnt % 4
        rNum = readText[int(number)]  # 일, 이 ...
        if prec == 0:
            # Every fourth position carries the group unit (만, 억, 조),
            # except the ones position itself.
            rLoc = readTextUnit[0][cnt // 4] if cnt != 0 else ''
            res = rNum + ' ' + rLoc
        else:
            rLoc = readTextUnit[prec]  # 십, 백, 천
            res = rNum + rLoc

        if rNum == '영':
            if len(numbers) == 1:
                # The number is exactly zero.
                res = rNum
            elif rLoc in ('만', '억', '조'):
                # A zero at a group boundary still emits the group unit,
                # unless the four-digit slice ending at this digit is '0000'.
                cLoc = len(numbers) - cnt
                res = '' if numbers[cLoc - 4:cLoc] == '0000' else rLoc
            else:
                res = ''
        elif rNum == '일':
            if cnt not in (12, 8, 4, 0):
                # 일십 / 일백 / 일천 are read as 십 / 백 / 천.
                res = rLoc
            elif cnt == 4 and len(numbers) == 5:
                # A leading 1 in a five-digit number is read '만', not '일 만'.
                res = rLoc

        if res:
            result.insert(0, res)
            if cnt < 2:
                lowTokens += 1
        # else: the digit is silent

    if option1:
        rStr = number2readCount(numbers[-2:], option2)
        # Replace only the tokens produced by the tens/ones digits.  Slicing
        # a fixed two entries would overwrite higher-order tokens whenever
        # those digits yielded fewer than two tokens (e.g. "101", "110").
        # An empty native reading (last two digits are zero) changes nothing,
        # so "100" keeps '백' and "0" keeps '영'.
        if rStr:
            result[len(result) - lowTokens:] = rStr

    return " ".join(result)
def convNumType3(match):
    """Read a signed decimal number and wrap the reading in brackets.

    Expects a match with groups (sign, integer part, dot, fraction part).
    The integer part is read with place units, the fraction digit by digit,
    and a non-empty sign group is read as '마이너스'.
    """
    # regex: '(\-?)(\d+)(\.)(\d+)'
    sign = '마이너스 ' if match.group(1) else ''
    integer = number2readText(match.group(2), 0, 0)
    fraction = number2readNumber(match.group(4))
    return '[' + sign + integer + ' 쩜|. ' + fraction + ']'
def convNumType4(match):
    """Read a dotted digit sequence digit by digit, wrapped in brackets.

    The whole match is split on '.'; every part is read through
    `number2readNumber` and followed by one space.
    """
    # regex: '([\d\.]+)'
    parts = match.group(0).split('.')
    return '[' + ''.join(number2readNumber(part) + " " for part in parts) + ']'
def convNumType5(match):
    """Read "<first> ... <second> <unit>" style matches.

    Uses group 1 (optional first number), group 5 (second number) and
    group 7 (trailing unit text).  When the unit text contains any entry of
    COUNT_UNIT, both numbers use the native-Korean reading.  ' 에서' is
    inserted when the second number exceeds the first by more than one.
    """
    # A missing unit group is treated as empty text instead of raising.
    unit = match.group(7) or ''
    # Pitfall: plain substring test, e.g. '미' is also found in '밀리미터'.
    opt = 1 if any(elem in unit for elem in COUNT_UNIT) else 0

    tStr = ' '
    if match.group(1):
        tStr += number2readText(match.group(1), opt, opt) + ' '
        if int(match.group(5)) - int(match.group(1)) > 1:
            tStr += " 에서"
    tStr += ' ' + number2readText(match.group(5), opt, opt) + ' ' + unit
    return tStr
def convNumType6(match):
    """Read "<number> <unit>" style matches.

    Uses group 1 (the number) and group 3 (optional unit text).  When the
    unit text contains any entry of COUNT_UNIT the number uses the
    native-Korean reading.  The unit text is appended unchanged.
    """
    # A missing unit group is treated as empty text instead of raising.
    unit = match.group(3) or ''
    opt = 1 if any(elem in unit for elem in COUNT_UNIT) else 0
    return ' ' + number2readText(match.group(1), opt, opt) + ' ' + unit
def convNumType9(match):
    """Read group 1 digit by digit, padded with one space on each side."""
    return ' ' + number2readNumber(match.group(1)) + ' '
def convNum_1(match):
    """Read a number with two or more dots, e.g. 2016.1.2 or 1.2.1.

    Exactly three parts whose first part has four significant digits are
    read as a date (년 / 월 / 일); anything else is read part by part with
    ' 쩜 ' between the parts.
    """
    tlist = match.group(0).split('.')
    if len(tlist) == 3 and len(str(int(tlist[0]))) == 4:
        year, month, day = (number2readText(part.strip(), 0, 0) for part in tlist)
        return year + " 년 " + month + " 월 " + day + " 일"
    return ' 쩜 '.join(number2readText(part.strip(), 0, 0) for part in tlist)
def convNum_2(match):
    """Read a decimal number that is directly followed by a symbol/unit.

    Group 1 is the dotted number, group 2 the trailing text.  Parts before
    the last dot are read with place units, the last part digit by digit;
    the trailing text is appended after one space.
    """
    *heads, last = match.group(1).split('.')
    tstr = ''.join(number2readText(part.strip(), 0, 0) + ' 쩜 ' for part in heads)
    tstr += number2readNumber(last.strip())
    return tstr + " " + match.group(2)
def convNum_3(match):
    """Read a plain decimal number such as 1.23.

    Parts before the last dot are read with place units and followed by
    ' [쩜] '; the last part is read digit by digit.
    """
    *heads, last = match.group(0).split('.')
    tstr = ''.join(number2readText(part.strip(), 0, 0) + ' [쩜] ' for part in heads)
    return tstr + number2readNumber(last.strip())
def convNum_4(match):
    """Read hyphen-separated numbers, each with place units, space-joined."""
    parts = match.group(0).split('-')
    return ' '.join(number2readText(part.strip(), 0, 0) for part in parts)
def convNum_5(match):
    """Read group 1 with the part below one hundred in native-Korean form.

    A single space is appended to the reading.
    """
    return number2readText(match.group(1), 1, 0) + " "
def convNum_6(match):
    """Read a fraction written without an integer part, e.g. ' .234'.

    Group 1 holds the digits after the dot; they are read one by one and
    prefixed with ' [쩜] '.
    """
    return " [쩜] " + number2readNumber(match.group(1))
def convNum_7(match):
    """Wrap group 1 in square brackets and append group 2 unchanged.

    The number may be followed by a counting unit, so it is left numeric
    (only marked with brackets) for the lexicon dictionary to resolve.
    """
    return "[" + match.group(1) + "]" + match.group(2)
def convNum_8(match):
    """Read group 1 with Sino-Korean place units, followed by one space."""
    return number2readText(match.group(1), 0, 0) + " "
def convNumType8(match):
    """Remove the commas from group 1 (123,456 -> 123456).

    The result is padded with one space on each side.
    """
    return " " + match.group(1).replace(',', '') + " "
def normalize4(lines):
    """Convert the numbers in each line to Korean readings.

    Every line is stripped and then processed in a fixed order: comma
    handling, dotted numbers, short numbers that may precede a counting
    unit, remaining numbers, and finally removal of brackets and of the
    punctuation . , ' ? !

    Args:
        lines: iterable of text lines.

    Returns:
        A list with one normalized string per input line.
    """
    normalized_lines = []
    for line in lines:
        tstr = line.strip()

        # --- commas -------------------------------------------------------
        # Put a space after a comma unless it sits between two digits.
        tstr = re.sub(r'([^ 0-9]),([^ 0-9])', r'\1, \2', tstr)
        tstr = re.sub(r'([0-9]),([^ 0-9])', r'\1, \2', tstr)
        tstr = re.sub(r'([^ 0-9]),([0-9])', r'\1, \2', tstr)
        # Numbers with thousands separators: 123,456 --> 123456.
        # NOTE(review): whitespace is required on both sides, so such a
        # number at the very start or end of the stripped line is not
        # matched - confirm this is intended.
        tstr = re.sub(r'\s([0-9][,0-9]{3,}[0-9])\s', convNumType8, tstr)

        # --- numbers with '.' ---------------------------------------------
        tstr = re.sub(r'\d+\.\s*\d+(\.\s*\d+)+', convNum_1, tstr)  # 2016.1.2 or 1.2.1
        tstr = re.sub(r'(\d+\.\d+)([^ 0-9A-Za-z]+)', convNum_2, tstr)  # 1.23%
        tstr = re.sub(r'\d+\.\d+', convNum_3, tstr)  # 1.23
        tstr = re.sub(r'\s\.([0-9]+)', convNum_6, tstr)  # .234

        # --- numbers possibly followed by a counting unit -----------------
        # Left numeric and only bracketed, e.g. "배추 1 박스" --> "배추 [1] 박스".
        tstr = re.sub(r'\b(\d{1,2})(\s*[^ \]0-9]+)', convNum_7, tstr)

        # Make sure brackets are separated from neighbouring text.
        tstr = re.sub(r'(\S)\[', r'\1 [', tstr)
        tstr = re.sub(r'\](\S)', r'] \1', tstr)

        # --- remaining numbers --------------------------------------------
        # Three or more digits first, then whatever digit runs are left.
        tstr = re.sub(r'(\b\d{3,}\b)', convNum_8, tstr)
        tstr = re.sub(r'(\b\d{1,}\b)', convNum_5, tstr)

        # --- clean-up -----------------------------------------------------
        tstr = re.sub(r'[\[\]]', '', tstr)
        tstr = re.sub(r"[\.\,'\?\!]", ' ', tstr)
        # A character repeated five or more times is cut down to three.
        tstr = re.sub(r'(.)\1{4,}', r'\1\1\1', tstr)
        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
def segment(match):
    """Return the matched text with each of , . ' ! / replaced by a space."""
    return re.sub(r"[,.'!/]", ' ', match.group(0))
def normalize3(lines):
    """Split punctuation out of tokens that contain upper-case letters.

    Tokens (runs of non-space characters) containing an upper-case ASCII
    letter together with one of . , ' or with a digit are passed through
    `segment`.  Runs of spaces are then collapsed.

    Args:
        lines: iterable of text lines.

    Returns:
        A list with one normalized string per input line.
    """
    normalized_lines = []
    for line in lines:
        tstr = line.strip()
        # Token with an upper-case letter and one of . , '
        tstr = re.sub(r"(?=\S*[A-Z])(?=\S*[\.\,'])\S*", segment, tstr)
        # Token with an upper-case letter and a digit
        tstr = re.sub(r'(?=\S*[A-Z])(?=\S*[0-9])\S*', segment, tstr)
        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
def normalize2(lines):
    """Filter out lines that cannot be converted to Korean text yet.

    A line is dropped when it contains 'www', 'http' or 'ftp', when it
    contains one of / - = : ~ + (symbols with several possible readings),
    or when it contains any character outside the whitelist (space,
    . , ? !, Hangul, digits, upper-case ASCII letters and
    `at_unicode.valids`); lower-case letters are therefore rejected.

    Args:
        lines: iterable of text lines.

    Returns:
        The list of surviving lines with spaces collapsed.
    """
    # Loop-invariant: whitelist of characters a surviving line may contain.
    allowed = re.compile(r"^[ \.,?!가-힣0-9A-Z" + re.escape(at_unicode.valids) + r"]+$")

    normalized_lines = []
    for line in lines:
        tstr = line.strip()
        # Remove a meaningless start (anything before the first letter/digit).
        tstr = re.sub(r'^[^0-9a-zA-Z가-힣]+', '', tstr)
        # A line without any Hangul is reduced to a single space.
        tstr = re.sub(r'^([^가-힣]+)$', ' ', tstr)

        # Ignore sentences with URLs.
        if any(marker in tstr for marker in ('www', 'http', 'ftp')):
            continue

        # Trailing ". " --> "."
        tstr = re.sub(r'\.\s*$', '.', tstr)
        tstr = re.sub(';', '', tstr)

        # Ignore sentences with symbols that have several pronunciations;
        # there are too many of them, another approach is needed.
        if re.search(r'[/\-=:~+]', tstr):
            continue

        # Attach punctuation to the preceding word and put a space after it.
        # (. and , are removed later, after the numerics have been treated.)
        tstr = re.sub(r"\s+([\.\,'\!\?])", r'\1 ', tstr)

        if allowed.match(tstr):
            tstr = re.sub(r'(\ )+', ' ', tstr).strip()
            normalized_lines.append(tstr)
    return normalized_lines
def normalize1(lines):
    """Clean raw lines and segment them into sentences.

    Blank lines are skipped.  For every other line the function, in order:
    turns separator shapes into newlines, removes bracketed content,
    normalizes quotes and percent signs, removes e-mail addresses and
    Chinese/Japanese characters, separates Hangul, digits and symbols with
    spaces, blanks out characters outside the kept set and inserts newlines
    at sentence boundaries.

    Args:
        lines: iterable of text lines.

    Returns:
        A list with one string per non-blank input line; a string may
        contain embedded newlines separating sentences.
    """
    # Loop-invariant patterns built from the at_unicode character sets.
    separator = re.compile('[' + re.escape(at_unicode.separators) + ']')
    quotes = re.escape(at_unicode.apostrophe + at_unicode.quatation)
    inner_quote = re.compile('([a-zA-Z])[' + quotes + ']([a-zA-Z])')
    any_quote = re.compile('[' + quotes + ']')
    percent = re.compile('[' + re.escape(at_unicode.percents) + ']')
    invalid_run = re.compile(at_unicode.invalids_chars)
    valid_run = re.compile('([' + re.escape(at_unicode.valids) + '])+')
    symbols = at_unicode.measureUnits + at_unicode.percents + at_unicode.currencies + at_unicode.userDefines
    symbol = re.compile(r"([" + re.escape(symbols) + "])")

    normalized_lines = []
    for line in lines:
        # Empty line
        if not line.strip():
            continue
        tstr = line

        # Separator shapes (by convention) start a new line.
        tstr = separator.sub('\n', tstr)

        # Remove bracketed contents.
        tstr = re.sub(r'\([^\)]+\)', '', tstr)
        tstr = re.sub(r'\[[^\]]+\]', '', tstr)
        tstr = re.sub(r'【[^】]+】', '', tstr)
        tstr = re.sub(r'<[^>]+>', '', tstr)

        # Keep a quote between two ASCII letters as a plain apostrophe and
        # drop every other quote character.
        tstr = inner_quote.sub(r'\1<apostrophe>\2', tstr)
        tstr = any_quote.sub('', tstr)
        tstr = re.sub('<apostrophe>', '\'', tstr)

        # Replace the various percent signs with one.
        tstr = percent.sub('%', tstr)

        # Miscellaneous
        tstr = re.sub('%p', '% 포인트', tstr)
        tstr = re.sub('±', '플러스 마이너스', tstr)
        tstr = re.sub(r'[a-zA-Z0-9_.]+@[a-zA-Z0-9_.]*', ' ', tstr)  # delete e-mail

        # Remove Chinese and Japanese characters.
        tstr = at_unicode.chinese.sub('', tstr)
        tstr = at_unicode.japanese.sub('', tstr)

        # Segment between Hangul and non-Hangul.
        tstr = re.sub(r"([가-힣])([^ 가-힣])", r"\1 \2", tstr)
        tstr = re.sub(r"([^ 가-힣])([가-힣])", r"\1 \2", tstr)

        # Segment between numerics and non-numerics.
        tstr = re.sub(r'([0-9])([^ \.\,0-9])', r'\1 \2', tstr)
        tstr = re.sub(r'([^ \+\-\.\,0-9])([0-9])', r'\1 \2', tstr)

        # Leave only valid characters.
        tstr = invalid_run.sub(' ', tstr)

        # Collapse a run of valid symbols to the last symbol of the run.
        # NOTE(review): the run may mix different symbols, so "%." becomes
        # "."; if only *repeated* symbols were meant, the pattern would need
        # a back-reference - confirm the intent before changing it.
        tstr = valid_run.sub(r'\1', tstr)

        # Make valid symbols, except punctuation, stand alone as words.
        tstr = symbol.sub(r' \1 ', tstr)

        # Segment sentences: Hangul followed by a full stop.
        tstr = re.sub(r'([가-힣])\s*\.', '\\1.\n', tstr)
        # Segment sentences 2: . ? ! followed by non-Hangul text and a space.
        tstr = re.sub(r'([가-힣])\s*([\.?!])\s*([^가-힣]+ )', '\\1\\2\n\\3', tstr)
        # Segment sentences 3: a free-standing / = : (not readable).
        tstr = re.sub(r'([가-힣])\s+[/=:]\s+([가-힣])', '\\1\n\\2', tstr)
        tstr = re.sub(r'([a-zA-Z])\s+[/=:]\s+([가-힣])', '\\1\n\\2', tstr)
        tstr = re.sub(r'([가-힣])\s+[/=:]\s+([a-zA-Z])', '\\1\n\\2', tstr)

        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment