Last active
January 22, 2021 02:18
-
-
Save miroblog/26f56170ba963cd93b2442c3fa53f4f0 to your computer and use it in GitHub Desktop.
at_unicode
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # | |
| # Copyright 2017 Atlas Guide (Author : Lucas Jo) | |
| # | |
| # Apache 2.0 | |
| # | |
| import unicodedata | |
| import re | |
def _bmp_chars(keep):
    """Join, in code-point order, every character below U+FFFF accepted by *keep*."""
    return "".join(filter(keep, map(chr, range(0xffff))))


def _in_category(categories, name_pattern=None):
    """Build a predicate on the general category and, optionally, the Unicode name.

    The name is only looked up for characters whose category already matched.
    """
    def keep(ch):
        if unicodedata.category(ch) not in categories:
            return False
        if name_pattern is None:
            return True
        return bool(re.search(name_pattern, unicodedata.name(ch)))
    return keep


# General categories searched for quotation marks and apostrophes.
_PUNCT_CATEGORIES = ('Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps')

# Code points U+3380..U+33DD (inclusive).
measureUnits = "".join(map(chr, range(0x3380, 0x33DD + 1)))
# 'Po' characters whose name contains PERCENT.
percents = _bmp_chars(_in_category(('Po',), 'PERCENT'))
# Every character of category 'Sc'.
currencies = _bmp_chars(_in_category(('Sc',)))
# Punctuation whose name contains QUOTATION / APOSTROPHE.
quatation = _bmp_chars(_in_category(_PUNCT_CATEGORIES, 'QUOTATION'))
apostrophe = _bmp_chars(_in_category(_PUNCT_CATEGORIES, 'APOSTROPHE'))

userDefines = "-~+=%/:;"
puctuations = ".,?!'"

# 'So' characters whose name contains the whole word TRIANGLE / CIRCLE / SQUARE.
triangles = _bmp_chars(_in_category(('So',), r' TRIANGLE\b'))
circles = _bmp_chars(_in_category(('So',), r' CIRCLE\b'))
squares = _bmp_chars(_in_category(('So',), r' SQUARE\b'))
separators = triangles + circles + squares

valids = measureUnits + percents + currencies + userDefines + puctuations

# Regex sources: runs of characters outside / inside the accepted alphabet
# (space, newline, Hangul syllables, ASCII digits and letters, plus `valids`).
_escaped_valids = re.escape(valids)
invalids_chars = r"[^ \n가-힣0-9a-zA-Z" + _escaped_valids + r"]+"
valids_chars = r"[ \n가-힣0-9a-zA-Z" + _escaped_valids + r"]+"
# Han characters. The ranges are spelled with explicit escapes (like the
# `japanese` pattern below) so that editors, copy/paste or Unicode
# normalisation cannot silently replace the compatibility ideographs that
# delimit the last three ranges.
chinese = re.compile(
    '['
    '\u2e80-\u2e99\u2e9b-\u2ef3'                     # 2E80-2EF3 : CJK radicals supplement
    '\u2f00-\u2fd5'                                  # 2F00-2FD5 : Kangxi radicals
    '\u3005\u3007\u3021-\u3029\u3038-\u303a\u303b'   # iteration marks, Hangzhou numerals
    '\u3400-\u4db5'                                  # 3400-4DB5 : extension A
    '\u4e00-\u9fc3'                                  # 4E00-9FC3 : unified ideographs
    '\uf900-\ufa2d\ufa30-\ufa6a\ufa70-\ufad9'        # F900-FAD9 : compatibility ideographs
    ']',
    re.UNICODE,
)
#3000-303F : punctuation
#3040-309F : hiragana
#30A0-30FF : katakana
#FF00-FFEF : Full-width roman + half-width katakana
#4E00-9FAF : Common and uncommon kanji
japanese = re.compile('[\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9faf]', re.UNICODE)
# Candidate Korean readings for the user-defined symbols.
# '=' is intentionally left out (its entry is disabled below).
userDefines_pronun = {
    "-": ["마이너스", "에서", "다시"],
    "~": ["에서", "부터"],
    "+": ["더하기", "플러스"],
    # "=": ["는", "은", "이콜"],
    "%": ["퍼센트", "프로", "퍼센티지"],
    "/": ["나누기", "퍼", "슬래쉬"],
}
# Korean readings for the squared unit symbols U+3380..U+33DD, in code-point
# order. Two entries were corrected: U+3394 (THz) used the reading for kHz and
# U+33A8 (m/s^2) contained a misspelt "쎄크".
measureUnits_pronun = {
    '㎀': ['피코 암페어'],
    '㎁': ['나노 암페어'],
    '㎂': ['마이크로 암페어'],
    '㎃': ['밀리 암페어'],
    '㎄': ['킬로 암페어'],
    '㎅': ['킬로 바이트'],
    '㎆': ['메가 바이트'],
    '㎇': ['기가 바이트'],
    '㎈': ['칼로리'],
    '㎉': ['킬로 칼로리'],
    '㎊': ['피코 페럿'],
    '㎋': ['나노 페럿'],
    '㎌': ['마이크로 페럿'],
    '㎍': ['마이크로 그램'],
    '㎎': ['밀리 그램'],
    '㎏': ['킬로 그램'],
    '㎐': ['헤르츠'],
    '㎑': ['킬로 헤르츠'],
    '㎒': ['메가 헤르츠'],
    '㎓': ['기가 헤르츠'],
    '㎔': ['테라 헤르츠'],  # THz (was wrongly read as kHz)
    '㎕': ['마이크로 리터'],
    '㎖': ['밀리 리터'],
    '㎗': ['데시 리터'],
    '㎘': ['킬로 리터'],
    '㎙': ['펨토 미터'],
    '㎚': ['나노 미터'],
    '㎛': ['마이크로 미터'],
    '㎜': ['밀리 미터'],
    '㎝': ['센티 미터'],
    '㎞': ['킬로 미터'],
    '㎟': ['제곱 밀리 미터'],
    '㎠': ['제곱 센티 미터'],
    '㎡': ['제곱 미터'],
    '㎢': ['제곱 킬로 미터'],
    '㎣': ['세 제곱 밀리 미터'],
    '㎤': ['세 제곱 센티 미터'],
    '㎥': ['세 제곱 미터'],
    '㎦': ['세 제곱 킬로 미터'],
    '㎧': ['미터 퍼 쎄크'],
    '㎨': ['미터 퍼 제곱 쎄크'],  # typo "쎄그" fixed
    '㎩': ['파스칼'],
    '㎪': ['킬로 파스칼'],
    '㎫': ['메가 파스칼'],
    '㎬': ['기가 파스칼'],
    '㎭': ['라디안'],
    '㎮': ['라디안 퍼 쎄크'],
    '㎯': ['라디안 퍼 제곱 쎄크'],
    '㎰': ['피코 쎄크'],
    '㎱': ['나노 쎄크'],
    '㎲': ['마이크로 쎄크'],
    '㎳': ['밀리 쎄크'],
    '㎴': ['피코 볼트'],
    '㎵': ['나노 볼트'],
    '㎶': ['마이크로 볼트'],
    '㎷': ['밀리 볼트'],
    '㎸': ['킬로 볼트'],
    '㎹': ['메가 볼트'],
    '㎺': ['피코 와트'],
    '㎻': ['나노 와트'],
    '㎼': ['마이크로 와트'],
    '㎽': ['밀리 와트'],
    '㎾': ['킬로 와트'],
    '㎿': ['메가 와트'],
    '㏀': ['킬로 옴'],
    '㏁': ['메가 옴'],
    '㏂': ['오전'],
    '㏃': ['베크렐'],
    '㏄': ['씨씨'],
    '㏅': ['칸델라'],
    '㏆': ['쿨롱 퍼 킬로 그램'],
    '㏇': ['씨 오'],
    '㏈': ['데시 벨'],
    '㏉': ['그레이'],
    '㏊': ['헥타르'],
    '㏋': ['마력'],
    '㏌': ['인치'],
    '㏍': ['킬로 카이저'],
    '㏎': ['킬로 미터'],
    '㏏': ['킬로 톤'],
    '㏐': ['루멘'],
    '㏑': ['로그'],
    '㏒': ['로그'],
    '㏓': ['럭스'],
    '㏔': ['밀리 바'],
    '㏕': ['밀'],
    '㏖': ['몰'],
    '㏗': ['피 에이치'],
    '㏘': ['오후'],
    '㏙': ['피 피 엠'],
    '㏚': ['피 알'],
    '㏛': ['스테라디안'],
    '㏜': ['시버트'],
    '㏝': ['웨버'],
}
# Korean readings for the currency symbols (general category 'Sc'), listed in
# code-point order. The final five entries are the FULLWIDTH forms; they are
# written as escapes so they cannot be confused with (or normalised into) the
# ASCII / Latin-1 signs at the top of the table, which would turn them into
# duplicate keys.
currencies_pronun = {
    '$': ['달러'],
    '¢': ['센트'],
    '£': ['파운드'],
    '¤': ['화폐 표시'],
    '¥': ['엔'],
    '֏': ['드람'],
    '؋': ['아프가니'],
    '৲': ['루피 마크'],
    '৳': ['루피 싸인'],
    '৻': ['간다'],
    '૱': ['루피'],
    '௹': ['루피'],
    '฿': ['바트'],
    '៛': ['리엘'],
    '₠': ['유로'],
    '₡': ['콜론'],
    '₢': ['크루제이로'],
    '₣': ['프랑'],
    '₤': ['리라'],
    '₥': ['밀'],
    '₦': ['나이라'],
    '₧': ['페세타'],
    '₨': ['루피'],
    '₩': ['원'],
    '₪': ['세겔'],
    '₫': ['동'],
    '€': ['유로'],
    '₭': ['킵'],
    '₮': ['터그릭'],
    '₯': ['드라크마'],
    '₰': ['페니'],
    '₱': ['페소'],
    '₲': ['과라니'],
    '₳': ['오스트랄'],
    '₴': ['리브니아'],
    '₵': ['세디'],
    '₶': ['토르노'],
    '₷': ['스페스밀로'],
    '₸': ['텐지'],
    '₹': ['루피'],
    '₺': ['리라'],
    '₻': ['노르딕'],
    '₼': ['마네'],
    '₽': ['루블'],
    '₾': ['라리'],
    '꠸': ['루피'],
    '﷼': ['리알'],
    '﹩': ['달러'],
    '\uff04': ['달러'],    # FULLWIDTH DOLLAR SIGN
    '\uffe0': ['센트'],    # FULLWIDTH CENT SIGN
    '\uffe1': ['파운드'],  # FULLWIDTH POUND SIGN
    '\uffe5': ['엔'],      # FULLWIDTH YEN SIGN
    '\uffe6': ['원'],      # FULLWIDTH WON SIGN
}
# TBD
# Extra symbols (extracted from the corpus) with their candidate readings.
validChars = {
    "℃": ["도", "도 섭씨", "도 씨"],
    "㈜": ["주", "주식회사"],
    "ρ": ["로"],
    # The next two keys look alike but are different code points.
    "\u03bc": ["뮤", "마이크로"],  # GREEK SMALL LETTER MU
    "\u00b5": ["마이크로", "뮤"],  # MICRO SIGN
    "W": ["와트"],
}
# When run as a script, show the regex source that matches valid characters.
if __name__ == "__main__":
    print(valids_chars)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import sys | |
| import at_unicode | |
# Largest value read with place-value words (16 digits).
MAX_NUMBER = 10 ** 16 - 1

# Index 0: words for every fourth place (10^0, 10^4, 10^8, 10^12);
# indexes 1..3: words for the tens, hundreds and thousands inside a group.
readTextUnit = [["", "만", "억", "조"], "십", "백", "천"]

# Digit words indexed by digit; index 10 is an empty placeholder.
readText = ["영", "일", "이", "삼", "사", "오", "육", "칠", "팔", "구", ""]
# Same as readText, except that 0 is read "공".
readNumber = ["공"] + readText[1:]

# Words for the tens (index = tens digit) used by number2readCount.
readCountUnit = ["", "열", "스물", "서른", "마흔", "쉰", "예순", "일흔", "여든", "아흔"]
# Words for the ones digit: row 0 is the stand-alone form, row 1 the form
# used in front of a unit word.
readCount = [
    ["", "하나", "둘", "셋", "넷", "다섯", "여섯", "일곱", "여덟", "아홉"],
    ["", "한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"],
]

# Unit words that trigger the readCount/readCountUnit reading.
COUNT_UNIT = [
    "배", "채", "개", "시", "말", "벌", "축", "톳", "손", "살", "죽", "쾌", "닢", "병", "건", "속", "주",
    "망", "포", "피", "미", "팩", "통", "줄", "봉", "단", "판", "모", "척", "번", "잔", "장", "쌍", "명",
    "마리", "가지", "방울", "자루", "켤레", "사람", "박스", "묶음", "보루", "봉지", "포기", "시루",
]
def number2readNumber(numbers):
    """Read a digit string one digit at a time.

    Each character of *numbers* is converted with ``int`` and looked up in
    ``readNumber``; the words are returned joined by single spaces, in the
    original digit order.
    """
    return " ".join(readNumber[int(digit)] for digit in numbers)
# Read a number with the readCount / readCountUnit word tables.
# Only values up to 99 are supported.
# option
#   0: no unit word follows the number
#   1: a unit word follows the number (this is the signature default)
def number2readCount(numbers, option=1):
    """Return the list of words for the digit string *numbers*.

    The last digit is looked up in ``readCount[option]``, every other digit
    in ``readCountUnit``; empty words are left out. The process is terminated
    through ``sys.exit`` when ``int(numbers)`` exceeds 99.
    """
    if int(numbers) > 99:
        sys.exit('Out-of-range: read count range is 1~99')
    words = []
    for position, digit in enumerate(reversed(numbers)):
        value = int(digit)
        if position == 0:
            word = readCount[option][value]
        else:
            word = readCountUnit[value]
        if word:
            words.append(word)
    # Collected from the ones digit upwards; return most significant first.
    words.reverse()
    return words
# Read a number with the place-value words of readText / readTextUnit.
# Largest supported value: 9999,9999,9999,9999 (MAX_NUMBER).
# option1
#   0: every digit uses the readText words (default)
#   1: the last two digits are re-read with number2readCount
# option2
#   forwarded to number2readCount as its `option`
def number2readText(numbers, option1=0, option2=0):
    """Return the spoken form of the digit string *numbers*.

    Leading zeros are ignored. Values above ``MAX_NUMBER`` fall back to the
    digit-by-digit reading of ``number2readNumber``. Otherwise one word is
    produced per non-silent digit and the words are joined with spaces (the
    word for a digit at every fourth place is built as ``digit + ' ' + unit``,
    so the ones digit keeps a trailing space).

    With *option1* set, the words that came from the last two digits are
    replaced by ``number2readCount(numbers[-2:], option2)``. Only those words
    are replaced: zero digits contribute no word, so blindly overwriting the
    last two entries would drop higher places (e.g. the "백" of "100"/"105").
    If the count reading is empty (last two digits are zero) nothing is
    replaced, which also keeps the "영" of a plain "0".
    """
    # pre-processing
    numbers = numbers.lstrip("0")
    if numbers == '':
        numbers = "0"
    if int(numbers) > MAX_NUMBER:
        return number2readNumber(numbers)

    cnt = 0          # place index, 0 = ones digit
    result = []
    tail_len = 0     # how many entries of `result` came from the last two digits
    for number in reversed(numbers):
        idxNum = int(number)
        prec = cnt % 4
        rNum = readText[idxNum]
        if prec == 0:
            # every fourth place: unit word is '', 만, 억 or 조
            rLoc = ''
            if cnt != 0:  # the ones place has no unit word
                rLoc = readTextUnit[0][cnt // 4]
            res = rNum + ' ' + rLoc
        else:
            rLoc = readTextUnit[prec]  # 십, 백, 천
            res = rNum + rLoc

        # Exceptions for '영': a zero digit is silent, except that
        # 만/억/조 is still spoken when its four-digit group is not all zero.
        if rNum == '영':
            if len(numbers) != 1:
                if rLoc in ['만', '억', '조']:
                    cLoc = len(numbers) - cnt
                    if numbers[cLoc - 4:cLoc] == '0000':
                        res = ''
                    else:
                        res = rLoc
                else:
                    res = ''
            else:
                res = rNum

        # Exceptions for '일': "일" is dropped in front of 십/백/천, and in
        # front of 만 when the number has exactly five digits.
        if rNum == '일':
            if cnt not in [12, 8, 4, 0]:
                res = rLoc
            elif cnt == 4 and len(numbers) == 5:
                res = rLoc

        if res:
            result.insert(0, res)
            if cnt < 2:
                tail_len += 1
        cnt += 1

    if option1:
        rStr = number2readCount(numbers[-2:], option2)
        if rStr:
            result[len(result) - tail_len:] = rStr

    return " ".join(result)
def convNumType3(match):
    """Read a signed decimal number and wrap the reading in square brackets.

    Per the original note the match is expected to come from
    ``(\\-?)(\\d+)(\\.)(\\d+)``: group 1 is the optional minus sign, group 2
    the integer part (read with ``number2readText``) and group 4 the fraction
    (read digit by digit).
    """
    sign = '마이너스 ' if match.group(1) else ''
    integer_part = number2readText(match.group(2), 0, 0)
    fraction = number2readNumber(match.group(4))
    return '[' + sign + integer_part + ' 쩜|. ' + fraction + ']'
def convNumType4(match):
    """Read every dot-separated field of the match digit by digit.

    Per the original note the match is expected to come from ``([\\d\\.]+)``.
    Each field's reading is followed by one space and the whole result is
    wrapped in square brackets.
    """
    fields = match.group(0).split('.')
    spoken = "".join(number2readNumber(field) + " " for field in fields)
    return '[' + spoken + ']'
def convNumType5(match):
    """Read "<from> <to> <unit>" style matches.

    Group 1 is an optional first number, group 5 the second number and
    group 7 the unit text (the producing regex is not visible here). When the
    unit contains any entry of ``COUNT_UNIT`` both numbers are read with the
    count-style options.
    """
    unit = match.group(7)
    first = match.group(1)
    last = match.group(5)
    # pitfall: this is a substring test, e.g. '미' is found inside '밀리미터'
    opt = 1 if any(elem in unit for elem in COUNT_UNIT) else 0

    pieces = [' ']
    if first:
        pieces.append(number2readText(first, opt, opt) + ' ')
        # Only ranges wider than one get the connective word.
        if int(last) - int(first) > 1:
            pieces.append(" 에서")
    pieces.append(' ' + number2readText(last, opt, opt) + ' ' + unit)
    return "".join(pieces)
def convNumType6(match):
    """Read "<number> <unit>" style matches.

    Group 1 is the number and group 3 the (optional) unit text. When the unit
    contains any entry of ``COUNT_UNIT`` the number is read with the
    count-style options. Returns ``' ' + reading + ' ' + unit``.

    An unmatched group 3 (``None``) is treated as "no unit"; previously the
    membership test raised ``TypeError`` before the function's own
    ``if match.group(3)`` guard was reached.
    """
    unit = match.group(3) or ''
    opt = 1 if any(elem in unit for elem in COUNT_UNIT) else 0
    return ' ' + number2readText(match.group(1), opt, opt) + ' ' + unit
def convNumType9(match):
    """Read group 1 digit by digit, padded with one space on each side."""
    return ' ' + number2readNumber(match.group(1)) + ' '
def convNum_1(match):
    """Read a dotted number sequence such as "2016.1.2" or "1.2.1".

    Three fields whose first field has exactly four significant digits are
    read as a date (년 / 월 / 일). Anything else is read field by field with
    " 쩜 " between the fields.
    """
    fields = [field.strip() for field in match.group(0).split('.')]
    readings = [number2readText(field, 0, 0) for field in fields]
    looks_like_date = len(fields) == 3 and len(str(int(fields[0]))) == 4
    if looks_like_date:
        year, month, day = readings
        return year + " 년 " + month + " 월 " + day + " 일"
    return ' 쩜 '.join(readings)
def convNum_2(match):
    """Read a decimal number (group 1) that is followed by a symbol (group 2).

    Every field before the last dot is read with ``number2readText`` and
    followed by " 쩜 "; the last field is read digit by digit. Group 2 is
    appended unchanged after one space.
    """
    *leading, fraction = match.group(1).split('.')
    spoken = "".join(number2readText(part.strip(), 0, 0) + ' 쩜 ' for part in leading)
    spoken += number2readNumber(fraction.strip())
    return spoken + " " + match.group(2)
def convNum_3(match):
    """Read a plain decimal number.

    Every field before the last dot is read with ``number2readText`` and
    followed by " [쩜] "; the last field is read digit by digit.
    """
    *leading, fraction = match.group(0).split('.')
    spoken = "".join(number2readText(part.strip(), 0, 0) + ' [쩜] ' for part in leading)
    return spoken + number2readNumber(fraction.strip())
def convNum_4(match):
    """Read each hyphen-separated field of the match, joined by single spaces."""
    fields = match.group(0).split('-')
    return ' '.join(number2readText(field.strip(), 0, 0) for field in fields)
def convNum_5(match):
    """Read group 1 with the last two digits in count style, plus a trailing space."""
    digits = match.group(1)
    spoken = number2readText(digits, 1, 0)
    return spoken + " "
def convNum_6(match):
    """Read a fraction written without an integer part (".234"), digit by digit."""
    fraction_digits = match.group(1)
    return " [쩜] " + number2readNumber(fraction_digits)
# The number could be followed by a count unit; it is only marked with
# brackets here and left for the lexicon dictionary.
def convNum_7(match):
    """Wrap group 1 in square brackets and append group 2 unchanged."""
    return "[{}]{}".format(match.group(1), match.group(2))
def convNum_8(match):
    """Read group 1 with ``number2readText`` (default options) plus a trailing space."""
    digits = match.group(1)
    return number2readText(digits, 0, 0) + " "
def convNumType8(match):
    """Strip the thousands separators from group 1 and pad it with spaces."""
    digits_only = match.group(1).replace(',', '')
    return " " + digits_only + " "
def normalize4(lines):
    """Replace the numerals of every line with their Korean reading.

    Each line is stripped, numbers are rewritten by the ``convNum*`` helpers,
    the remaining brackets and the punctuation ``. , ' ? !`` are removed,
    long character repetitions are shortened and runs of spaces collapsed.
    Returns the list of processed lines (one output per input line).

    All patterns are raw strings: the previous plain literals relied on
    invalid escape sequences such as ``'\\s'`` and ``'\\d'``, which newer
    Python versions warn about. The pattern values are unchanged.
    """
    normalized_lines = []
    for line in lines:
        tstr = line.strip()

        # -----------
        # numbers with ',': 123,456 --> 123456
        # First put a space after every comma that is not between two digits.
        tstr = re.sub(r'([^ 0-9]),([^ 0-9])', r'\1, \2', tstr)
        tstr = re.sub(r'([0-9]),([^ 0-9])', r'\1, \2', tstr)
        tstr = re.sub(r'([^ 0-9]),([0-9])', r'\1, \2', tstr)
        # NOTE(review): the pattern needs whitespace on both sides, so a
        # comma-grouped number at the very start or end of the line is skipped.
        tstr = re.sub(r'\s([0-9][,0-9]{3,}[0-9])\s', convNumType8, tstr)

        # numbers with '.'
        tstr = re.sub(r'\d+\.\s*\d+(\.\s*\d+)+', convNum_1, tstr)       # 2016.1.2 or 1.2.1
        tstr = re.sub(r'(\d+\.\d+)([^ 0-9A-Za-z]+)', convNum_2, tstr)   # 1.23%
        tstr = re.sub(r'\d+\.\d+', convNum_3, tstr)                     # 1.23
        tstr = re.sub(r'\s\.([0-9]+)', convNum_6, tstr)                 # .234

        # numbers possibly followed by a count unit (classifier):
        # mark them with brackets, e.g. "배추 1 박스" --> "배추 [1] 박스"
        tstr = re.sub(r'\b(\d{1,2})(\s*[^ \]0-9]+)', convNum_7, tstr)

        # make sure brackets are separated from neighbouring text
        tstr = re.sub(r'(\S)\[', r'\1 [', tstr)
        tstr = re.sub(r'\](\S)', r'] \1', tstr)

        # read the remaining numerals: three or more digits first, then the rest
        tstr = re.sub(r'(\b\d{3,}\b)', convNum_8, tstr)
        tstr = re.sub(r'(\b\d{1,}\b)', convNum_5, tstr)

        # remove brackets, then replace . , ' ? ! with spaces
        tstr = re.sub(r'[\[\]]', '', tstr)
        tstr = re.sub(r"[\.\,'\?\!]", ' ', tstr)

        # shorten a character repeated five or more times to three
        tstr = re.sub(r'(.)\1{4,}', r'\1\1\1', tstr)

        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
def segment(match):
    """Return the matched text with each of ``, . ' ! /`` replaced by a space.

    The character class is a raw string; the previous plain literal relied on
    invalid escape sequences (``'\\,'``, ``'\\.'``, ...). The pattern value is
    unchanged.
    """
    return re.sub(r"([\,\.'\!\/])", ' ', match.group(0))
def normalize3(lines):
    """Split tokens that mix capital letters with punctuation or digits.

    For every stripped line, each whitespace-free token that contains a
    capital letter together with one of ``. , '`` or together with a digit is
    passed to ``segment``. Runs of spaces are then collapsed. Returns the
    list of processed lines (one output per input line).

    The patterns are raw strings; the previous plain literals relied on
    invalid escape sequences such as ``'\\S'``. The pattern values are
    unchanged.
    """
    normalized_lines = []
    for line in lines:
        tstr = line.strip()
        # capital letter + one of . , '
        tstr = re.sub(r"(?=\S*[A-Z])(?=\S*[\.\,'])\S*", segment, tstr)
        # capital letter + digit
        tstr = re.sub(r'(?=\S*[A-Z])(?=\S*[0-9])\S*', segment, tstr)
        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
def normalize2(lines):
    """Filter lines, keeping only those made of readable characters.

    A line is dropped when it contains "www", "http" or "ftp", when it
    contains one of ``/ - = : ~ +`` or when, after clean-up, it contains
    anything other than spaces, ``. , ? !``, Hangul syllables, digits,
    capital ASCII letters and ``at_unicode.valids``. Returns the kept lines
    with runs of spaces collapsed.

    The patterns are raw strings (the previous plain literals relied on
    invalid escape sequences such as ``'\\s'``), and the loop-invariant filter
    pattern is built once. The pattern values are unchanged.
    """
    # lines with [a-z] are rejected: not convertible into Korean without
    # transliteration
    accepted_line = r"^[ \.,?!가-힣0-9A-Z" + re.escape(at_unicode.valids) + r"]+$"

    normalized_lines = []
    for line in lines:
        tstr = line.strip()

        # remove a meaningless start
        tstr = re.sub('^[^0-9a-zA-Z가-힣]+', '', tstr)
        # blank out a line without any Hangul
        tstr = re.sub('^([^가-힣]+)$', ' ', tstr)

        # ignore sentences with urls
        if re.search('www', tstr):
            continue
        if re.search('http', tstr):
            continue
        if re.search('ftp', tstr):
            continue

        # drop whitespace after a final '.'
        tstr = re.sub(r'\.\s*$', '.', tstr)
        # remove ';'
        tstr = re.sub(';', '', tstr)

        # ignore sentences with symbols that have several pronunciations
        # (too many cases; needs another approach)
        if re.search(r'[/\-=:~+]', tstr):
            continue

        # move whitespace from before a punctuation mark to after it
        # ('.' and ',' themselves are removed after the numerals are treated)
        tstr = re.sub(r"\s+([\.\,'\!\?])", r'\1 ', tstr)

        if re.match(accepted_line, tstr):
            tstr = re.sub(r'(\ )+', ' ', tstr).strip()
            normalized_lines.append(tstr)
    return normalized_lines
def normalize1(lines):
    """Clean raw text lines and split them into sentence-like pieces.

    Blank lines are skipped. For every other line the function removes
    bracketed content, quotes, e-mail addresses, Chinese/Japanese characters
    and every character outside the accepted alphabet, unifies percent signs,
    puts spaces around symbols and between Hangul / numerals and their
    neighbours, and inserts newlines at separators and sentence ends. One
    string is returned per kept input line; it may contain embedded newlines.

    Changes compared with the previous version:
    * patterns are raw strings (the plain literals relied on invalid escape
      sequences such as ``'\\s'``); their values are unchanged;
    * loop-invariant patterns are built once;
    * "remove repeated valid symbols" now collapses only runs of the *same*
      symbol. The old ``([..])+`` collapsed any mixed run to its last symbol,
      so e.g. the ``%`` of "10%, 20%" was lost.
    """
    separator_class = '[' + re.escape(at_unicode.separators) + ']'
    quote_class = '[' + re.escape(at_unicode.apostrophe + at_unicode.quatation) + ']'
    percent_class = '[' + re.escape(at_unicode.percents) + ']'
    repeated_valid = '([' + re.escape(at_unicode.valids) + r'])\1+'
    symbols = (at_unicode.measureUnits + at_unicode.percents
               + at_unicode.currencies + at_unicode.userDefines)
    symbol_group = r"([" + re.escape(symbols) + "])"

    normalized_lines = []
    for line in lines:
        # empty line
        if not line.strip():
            continue
        tstr = line

        # separator symbols (conventions) start a new line
        tstr = re.sub(separator_class, '\n', tstr)

        # remove bracketed contents
        tstr = re.sub(r'\([^\)]+\)', '', tstr)
        tstr = re.sub(r'\[[^\]]+\]', '', tstr)
        tstr = re.sub('【[^】]+】', '', tstr)
        tstr = re.sub(r'\<[^\>]+\>', '', tstr)

        # handle apostrophe: keep a quote between two ASCII letters as "'",
        # delete every other quote
        tstr = re.sub('([a-zA-Z])' + quote_class + '([a-zA-Z])', r'\1<apostrophe>\2', tstr)
        tstr = re.sub(quote_class, '', tstr)
        tstr = re.sub('<apostrophe>', "'", tstr)

        # replace the various percent signs with one
        tstr = re.sub(percent_class, '%', tstr)

        # miscellaneous
        tstr = re.sub('%p', '% 포인트', tstr)
        tstr = re.sub('±', '플러스 마이너스', tstr)
        tstr = re.sub('[a-zA-Z0-9_.]+@[a-zA-Z0-9_.]*', ' ', tstr)  # delete e-mail

        # remove chinese and japanese characters
        tstr = re.sub(at_unicode.chinese, '', tstr)
        tstr = re.sub(at_unicode.japanese, '', tstr)

        # segment b/w Hangul and non-Hangul
        tstr = re.sub(r"([가-힣])([^ 가-힣])", r"\1 \2", tstr)
        tstr = re.sub(r"([^ 가-힣])([가-힣])", r"\1 \2", tstr)

        # segment b/w numerics and non-numerics
        tstr = re.sub(r'([0-9])([^ \.\,0-9])', r'\1 \2', tstr)
        tstr = re.sub(r'([^ \+\-\.\,0-9])([0-9])', r'\1 \2', tstr)

        # leave only valid characters
        tstr = re.sub(at_unicode.invalids_chars, ' ', tstr)

        # remove repeated valid symbols (runs of one and the same symbol)
        tstr = re.sub(repeated_valid, r'\1', tstr)

        # make valid symbols, except punctuation, stand alone as words
        tstr = re.sub(symbol_group, r' \1 ', tstr)

        # segment sentences
        tstr = re.sub(r'([가-힣])\s*\.', '\\1.\n', tstr)
        # segment sentences 2
        tstr = re.sub(r'([가-힣])\s*([\.?!])\s*([^가-힣]+ )', '\\1\\2\n\\3', tstr)
        # segment sentences 3: a free-standing / = : (not readable) splits the line
        tstr = re.sub(r'([가-힣])\s+[/=:]\s+([가-힣])', '\\1\n\\2', tstr)
        tstr = re.sub(r'([a-zA-Z])\s+[/=:]\s+([가-힣])', '\\1\n\\2', tstr)
        tstr = re.sub(r'([가-힣])\s+[/=:]\s+([a-zA-Z])', '\\1\n\\2', tstr)

        tstr = re.sub(r'(\ )+', ' ', tstr).strip()
        normalized_lines.append(tstr)
    return normalized_lines
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment