-
-
Save enamoria/e11edd8ec32863e2d83652f120c450c6 to your computer and use it in GitHub Desktop.
Chuẩn hóa cách gõ dấu câu về kiểu gõ cũ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Copyright @ nguyenvanhieu.vn
This Python code does not preserve lower/upper case.
It will be updated when time permits.
""" | |
import re | |
import os | |
import sys | |
# from Logger import LogEventSourcing | |
from datetime import datetime | |
import dateutil.parser | |
import traceback | |
import time | |
import requests | |
# logger = LogEventSourcing() | |
def call_api(data, url, method, timeout=3):
    """Send ``data`` to ``url`` with fixed form-encoded headers.

    :param data: payload forwarded to ``requests.request`` as ``data``.
    :param url: address the request is sent to.
    :param method: HTTP method name forwarded to ``requests.request``.
    :param timeout: timeout forwarded to ``requests.request`` (default 3).
    :return: whatever ``requests.request`` returns.
    """
    request_headers = dict([
        ('content-type', "application/x-www-form-urlencoded"),
        ('cache-control', "no-cache"),
        ('postman-token', "6a410524-a8e2-79c7-bd9d-53e4b68c84c7"),
    ])
    return requests.request(
        method,
        url,
        data=data,
        headers=request_headers,
        timeout=timeout,
    )
# Accented Vietnamese letters and, position by position, their unaccented
# ASCII counterparts: unsignChars[i] is the plain form of uniChars[i].
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
# Written as run lengths so the alignment with uniChars can be audited:
# 17 a-forms, 11 e-forms, d, 5 i-forms, 17 o-forms, 11 u-forms, 5 y-forms,
# the same again in upper case, then the six trailing bare letters.
# The upper-case I run used to hold only three letters, which paired the
# last two upper-case I forms with "O".
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
def loaddicchar():
    """Build the table that maps composite tone spellings to single letters.

    Each key is a Vietnamese base vowel followed by one combining tone mark
    (two code points); each value is the equivalent precomposed letter.
    The table is generated from explicit code points instead of two
    look-alike string literals, because the two spellings are visually
    indistinguishable and the combining marks are easily lost when the
    source is copied or re-encoded.

    NOTE(review): this assumes the composite spelling to be converted is
    "base letter + combining tone mark" -- confirm against real input.

    :return: dict of 120 entries (12 base vowels x 5 tones x 2 cases).
    """
    import unicodedata  # local import keeps this block self-contained

    # Combining grave, acute, hook above, tilde, dot below.
    tone_marks = '\u0300\u0301\u0309\u0303\u0323'
    # a, a-circumflex, a-breve, e, e-circumflex, i, o, o-circumflex,
    # o-horn, u, u-horn, y -- escaped so no invisible characters are needed.
    base_vowels = 'a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y'

    dic = {}
    for base in base_vowels + base_vowels.upper():
        for mark in tone_marks:
            composite = base + mark
            dic[composite] = unicodedata.normalize('NFC', composite)
    return dic


dicchar = loaddicchar()

# Derived from the table keys so the pattern can never drift out of sync
# with the dictionary it is looked up in; compiled once at import time.
_COMPOSITE_TONE_RE = re.compile('|'.join(map(re.escape, dicchar)))


def convertwindown1525toutf8(txt):
    """Replace every composite tone spelling in ``txt`` with its single letter.

    :param txt: text that may contain "vowel + combining tone mark" pairs.
    :return: ``txt`` with each pair found in ``dicchar`` replaced by the
        precomposed letter; all other characters are left untouched.
    """
    return _COMPOSITE_TONE_RE.sub(lambda match: dicchar[match.group()], txt)
""" | |
Start section: convert a sentence to the telex keystrokes that come out when Unikey is off
Example: thủy = thuyr, tượng = tuwowngj
""" | |
# Vowel table: one row per base vowel. Columns 0-5 hold the vowel with no
# tone and then with each of the five tone marks; the last column is the
# telex keystrokes for the base vowel.
bang_nguyen_am = [
    [*'aàáảãạ', 'a'],
    [*'ăằắẳẵặ', 'aw'],
    [*'âầấẩẫậ', 'aa'],
    [*'eèéẻẽẹ', 'e'],
    [*'êềếểễệ', 'ee'],
    [*'iìíỉĩị', 'i'],
    [*'oòóỏõọ', 'o'],
    [*'ôồốổỗộ', 'oo'],
    [*'ơờớởỡợ', 'ow'],
    [*'uùúủũụ', 'u'],
    [*'ưừứửữự', 'uw'],
    [*'yỳýỷỹỵ', 'y'],
]

# Telex tone keys, indexed by the tone column of bang_nguyen_am.
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, tone column). The trailing telex
# column of each row is deliberately left out.
nguyen_am_to_ids = {
    vowel: (row_number, tone_column)
    for row_number, row in enumerate(bang_nguyen_am)
    for tone_column, vowel in enumerate(row[:-1])
}
def vn_word_to_telex_type(word):
    """Spell one word as telex keystrokes.

    Every character found in ``nguyen_am_to_ids`` is replaced by the telex
    form of its base vowel; every other character is copied as is. The tone
    key of the last toned vowel seen is appended once, at the end of the word.

    :param word: a single word.
    :return: the telex spelling of ``word``.
    """
    tone = 0
    pieces = []
    for letter in word:
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            # Not a known vowel: keep the character unchanged.
            pieces.append(letter)
        else:
            if column != 0:
                tone = column
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to its telex spelling.

    The sentence is split on whitespace, each word is converted with
    ``vn_word_to_telex_type`` and the results are joined with single spaces.

    :param sentence: text to convert.
    :return: the converted sentence.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
""" | |
End section: convert a sentence to the telex keystrokes that come out when Unikey is off
""" | |
""" | |
Start section: normalise tone-mark placement to the old style: use òa, úy instead of oà, uý
See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
""" | |
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its old-style position.

    The word is expected in lower case. Words rejected by
    ``is_valid_vietnam_word`` are returned unchanged.

    Placement rules, in the order they are applied:

    * the ``u`` of "qu" and the ``i`` of "gi" are not counted as vowels of
      the nucleus; if fewer than two vowels remain, the tone goes on the
      vowel right after that glide, or on the glide itself if none follows;
    * with fewer than two vowels and no "qu"/"gi", the word is returned as is;
    * if the nucleus contains ``ê`` or ``ơ``, that vowel takes the tone;
    * with two vowels, the tone goes on the first one when no letter follows
      the nucleus, otherwise on the second;
    * with three or more vowels, the tone goes on the second one.

    The glide is recognised wherever it occurs, and "no letter follows" is
    judged on letters only, so that leading or trailing punctuation attached
    to the word (an opening quote, a trailing comma) does not change the
    result.

    :param word: a single lower-case word.
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0            # tone column of the last toned vowel seen
    nguyen_am_index = []   # positions of the nucleus vowels
    glide_index = -1       # position of the u in "qu" / the i in "gi"
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        if index != 0 and (
            (x == 9 and chars[index - 1] == 'q')      # qu
            or (x == 5 and chars[index - 1] == 'g')   # gi
        ):
            glide_index = index
        if y != 0:
            # Remember the tone and strip it; it is re-applied below.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        if index != glide_index:
            nguyen_am_index.append(index)

    if len(nguyen_am_index) < 2:
        if glide_index == -1:
            # A single vowel already carries the tone in the only possible place.
            return word
        following = glide_index + 1
        if following < len(chars) and chars[following] in nguyen_am_to_ids:
            target = following
        else:
            target = glide_index
        x, y = nguyen_am_to_ids[chars[target]]
        chars[target] = bang_nguyen_am[x][dau_cau]
        return ''.join(chars)

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        last_vowel = nguyen_am_index[-1]
        # Only letters count as a final consonant; punctuation does not.
        has_coda = any(c.isalpha() for c in chars[last_vowel + 1:])
        target = nguyen_am_index[1] if has_coda else nguyen_am_index[0]
    else:
        target = nguyen_am_index[1]
    x, y = nguyen_am_to_ids[chars[target]]
    chars[target] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)
def is_valid_vietnam_word(word):
    """Tell whether all vowels of ``word`` form one contiguous run.

    A character counts as a vowel when it is a key of ``nguyen_am_to_ids``.
    Words with no vowel or a single vowel are accepted.

    :param word: the word to check.
    :return: ``False`` if two vowels are separated by other characters,
        ``True`` otherwise.
    """
    vowel_positions = [
        position
        for position, letter in enumerate(word)
        if letter in nguyen_am_to_ids
    ]
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )
def chuan_hoa_dau_cau_tieng_viet(sentence, keep_case=False):
    """Normalise a Vietnamese sentence to old-style tone-mark placement.

    The sentence is split on whitespace, each word is lower-cased and passed
    to ``chuan_hoa_dau_tu_tieng_viet``, and the results are joined with
    single spaces.

    :param sentence: text to normalise.
    :param keep_case: when ``False`` (the default) the result is entirely
        lower case. When ``True`` the upper-case positions of each input
        word are restored on the normalised word; a word whose length
        changes when lower-cased cannot be mapped position by position and
        is kept exactly as given.
    :return: the normalised sentence.
    """
    normalised = []
    for word in sentence.split():
        fixed = chuan_hoa_dau_tu_tieng_viet(word.lower())
        if keep_case:
            if len(fixed) == len(word):
                fixed = ''.join(
                    # Skip letters whose upper-case form is not one character.
                    new.upper() if old.isupper() and len(new.upper()) == 1 else new
                    for old, new in zip(word, fixed)
                )
            else:
                fixed = word
        normalised.append(fixed)
    return ' '.join(normalised)
""" | |
End section: normalise tone-mark placement to the old style: use òa, úy instead of oà, uý
See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
""" | |
if __name__ == '__main__':
    # Small demo: normalise one sample sentence and print the result.
    sample = 'nếu ngày mai trời nắng'
    print(chuan_hoa_dau_cau_tieng_viet(sample))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright @ nguyenvanhieu.vn
This Java code preserves lower/upper case.
It is the more careful version: it only re-places a tone mark when the word really needs one.
The Python version does not make that check, but no bugs have been seen with it so far :v
*/ | |
package utils; | |
import java.util.*; | |
/**
 * Helpers for normalising Vietnamese text: converting composite tone
 * spellings to precomposed letters and moving tone marks to their old-style
 * position while keeping the original upper/lower case.
 */
public class NlpUtils {
    /** Composite spelling (base vowel + combining tone mark) to precomposed letter. */
    static Map<String, String> dictChar;

    /** One row per base vowel; column 0 is toneless, columns 1-5 carry the five tones. */
    static Character[][] vowelTable = {
            {'a', 'à', 'á', 'ả', 'ã', 'ạ'},
            {'ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ'},
            {'â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ'},
            {'e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ'},
            {'ê', 'ề', 'ế', 'ể', 'ễ', 'ệ'},
            {'i', 'ì', 'í', 'ỉ', 'ĩ', 'ị'},
            {'o', 'ò', 'ó', 'ỏ', 'õ', 'ọ'},
            {'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ'},
            {'ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ'},
            {'u', 'ù', 'ú', 'ủ', 'ũ', 'ụ'},
            {'ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự'},
            {'y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ'}
    };

    /**
     * Combining grave, acute, hook above, tilde and dot below, in the order
     * of vowelTable columns 1-5. Written as escapes so that no invisible
     * characters have to survive in the source file.
     */
    private static final char[] TONE_MARKS = {'\u0300', '\u0301', '\u0309', '\u0303', '\u0323'};

    /** Every character a word may contain to be treated as Vietnamese. */
    static Set<Character> vietnamChars;
    static Map<Character, Integer> vowelLookupRow = new HashMap<>();
    static Map<Character, Integer> vowelLookupColumn = new HashMap<>();

    static {
        dictChar = loadDictChar();
        vietnamChars = new HashSet<>();
        // Plain Latin letters, both cases.
        for (char c = 'a'; c <= 'z'; c++) {
            vietnamChars.add(c);
            vietnamChars.add(Character.toUpperCase(c));
        }
        // Every vowel of the table, both cases, plus the row/column lookups.
        for (int i = 0; i < vowelTable.length; i++) {
            for (int j = 0; j < vowelTable[i].length; j++) {
                char vowel = vowelTable[i][j];
                vowelLookupRow.put(vowel, i);
                vowelLookupColumn.put(vowel, j);
                vietnamChars.add(vowel);
                vietnamChars.add(Character.toUpperCase(vowel));
            }
        }
        // d with stroke, lower and upper case.
        vietnamChars.add('\u0111');
        vietnamChars.add('\u0110');
    }

    /**
     * Builds the composite-to-precomposed table from vowelTable instead of
     * from two look-alike string literals.
     * NOTE(review): assumes the composite spelling to convert is
     * "base letter + combining tone mark" -- confirm against real input.
     *
     * @return map with one entry per toned vowel, in lower and upper case
     */
    private static Map<String, String> loadDictChar() {
        Map<String, String> table = new HashMap<>();
        for (Character[] row : vowelTable) {
            String base = String.valueOf(row[0]);
            for (int tone = 1; tone < row.length; tone++) {
                String composite = base + TONE_MARKS[tone - 1];
                String precomposed = String.valueOf(row[tone]);
                table.put(composite, precomposed);
                table.put(composite.toUpperCase(Locale.ROOT), precomposed.toUpperCase(Locale.ROOT));
            }
        }
        return table;
    }

    /**
     * Replaces every composite tone spelling in the sentence with its
     * precomposed letter.
     *
     * @param sentence text to convert
     * @return the converted text
     */
    public static String convertChar1252ToUtf8(String sentence) {
        for (Map.Entry<String, String> entry : dictChar.entrySet()) {
            // Literal replacement: the keys are plain text, not regular expressions.
            sentence = sentence.replace(entry.getKey(), entry.getValue());
        }
        return sentence;
    }

    /**
     * Checks whether the word is a toned Vietnamese word: only characters
     * from vietnamChars, all vowels contiguous, and exactly one tone mark.
     * The word must already be lower case.
     */
    private static boolean isVietnamWord(String word) {
        boolean hasAccent = false;
        int currentVowel = -1;
        for (int i = 0; i < word.length(); i++) {
            if (!vietnamChars.contains(word.charAt(i))) return false;
            if (vowelLookupRow.containsKey(word.charAt(i))) {
                if (currentVowel == -1)
                    currentVowel = i;
                else {
                    if (i - currentVowel != 1) return false;
                    currentVowel = i;
                }
                if (vowelLookupColumn.get(word.charAt(i)) > 0) {
                    if (hasAccent) return false; // a word cannot carry two tone marks
                    hasAccent = true;
                }
            }
        }
        return hasAccent;
    }

    /**
     * Lower-cases the word and, if it is a toned Vietnamese word, moves the
     * tone mark to its old-style position.
     *
     * @param word a single word
     * @return the lower-cased, normalised word
     */
    private static String correctVnAccentWord(String word) {
        // Locale.ROOT keeps the result independent of the JVM default locale.
        word = word.toLowerCase(Locale.ROOT);
        if (!isVietnamWord(word)) return word;

        char[] chars = word.toCharArray();
        int accentPosition = 0, x, y;
        boolean isQuOrGi = false;
        List<Integer> vowelsIndex = new ArrayList<>();
        for (int i = 0; i < chars.length; i++) {
            x = vowelLookupRow.getOrDefault(chars[i], -1);
            y = vowelLookupColumn.getOrDefault(chars[i], -1);
            if (x == -1) continue;
            else if (x == 9) { // qu
                if (i != 0 && chars[i - 1] == 'q') {
                    chars[i] = 'u';
                    isQuOrGi = true;
                }
            } else if (x == 5) { // gi
                if (i != 0 && chars[i - 1] == 'g') {
                    chars[i] = 'i';
                    isQuOrGi = true;
                }
            }
            if (y != 0) {
                // Remember the tone and strip it; it is re-applied below.
                accentPosition = y;
                chars[i] = vowelTable[x][0];
            }
            // The u of "qu" / i of "gi" at index 1 is not part of the nucleus.
            if (!isQuOrGi || i != 1) {
                vowelsIndex.add(i);
            }
        }

        if (vowelsIndex.size() < 2) {
            if (isQuOrGi) {
                if (chars.length == 2) {
                    x = vowelLookupRow.get(chars[1]);
                    chars[1] = vowelTable[x][accentPosition];
                } else {
                    x = vowelLookupRow.getOrDefault(chars[2], -1);
                    if (x != -1) {
                        chars[2] = vowelTable[x][accentPosition];
                    } else {
                        chars[1] = (chars[1] == 'i' ? vowelTable[5][accentPosition] : vowelTable[9][accentPosition]);
                    }
                }
                return String.copyValueOf(chars);
            }
            return word;
        }

        for (int index : vowelsIndex) {
            x = vowelLookupRow.get(chars[index]);
            if (x == 4 || x == 8) { // ê, ơ always take the tone
                chars[index] = vowelTable[x][accentPosition];
                return String.copyValueOf(chars);
            }
        }

        if (vowelsIndex.size() == 2) {
            if (vowelsIndex.get(vowelsIndex.size() - 1) == chars.length - 1) {
                // Nucleus ends the word: tone on the first vowel.
                x = vowelLookupRow.get(chars[vowelsIndex.get(0)]);
                chars[vowelsIndex.get(0)] = vowelTable[x][accentPosition];
            } else {
                x = vowelLookupRow.get(chars[vowelsIndex.get(1)]);
                chars[vowelsIndex.get(1)] = vowelTable[x][accentPosition];
            }
        } else {
            x = vowelLookupRow.get(chars[vowelsIndex.get(1)]);
            chars[vowelsIndex.get(1)] = vowelTable[x][accentPosition];
        }
        return String.copyValueOf(chars);
    }

    /** Records, for each character of the word, whether it is upper case. */
    private static List<Boolean> getUpperState(String word) {
        List<Boolean> uppers = new ArrayList<>();
        for (char c : word.toCharArray()) {
            uppers.add(Character.isUpperCase(c));
        }
        return uppers;
    }

    /**
     * Re-applies a recorded upper-case pattern to the word. If the pattern
     * and the word differ in length the word is returned unchanged, since
     * positions can no longer be matched.
     */
    private static String updateUpperState(String word, List<Boolean> uppers) {
        char[] chars = word.toCharArray();
        if (uppers.size() != chars.length) {
            return word;
        }
        for (int i = 0; i < chars.length; i++) {
            chars[i] = uppers.get(i) ? Character.toUpperCase(chars[i]) : chars[i];
        }
        return String.copyValueOf(chars);
    }

    /**
     * Normalises tone-mark placement for every whitespace-separated word of
     * the sentence, keeping each word's upper/lower case.
     *
     * @param sentence text to normalise
     * @return the normalised words joined by single spaces
     */
    public static String correctVnAccentSentence(String sentence) {
        String[] words = sentence.split("\\s+");
        for (int i = 0; i < words.length; i++) {
            String corrected = correctVnAccentWord(words[i]);
            // Lower-casing can change the length of a word (for example a
            // capital I with dot above becomes two characters). Such a word
            // cannot be Vietnamese and its case pattern cannot be restored
            // by position, so it is kept exactly as given.
            if (corrected.length() == words[i].length()) {
                words[i] = updateUpperState(corrected, getUpperState(words[i]));
            }
        }
        return String.join(" ", words);
    }

    public static void main(String[] args) {
        System.out.println(NlpUtils.correctVnAccentSentence("chuôí"));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Code java lỗi trong trường hợp này: "İlkay". Lý do lỗi là khi lowerCase thì toCharArray nó sẽ thêm 1 kí tự ' thành ra có 6 kí tự, còn nếu giữ nguyên hoặc upperCase sẽ chỉ có 5 kí tự