Chuẩn hóa cách gõ dấu câu về kiểu gõ cũ (Python + Java version)
# -*- coding: utf-8 -*- | |
import regex as re | |
# Accented Vietnamese letters and, position by position, the plain ASCII
# letter each one reduces to.  ``unsignChars`` is assembled from explicit
# per-letter run lengths so that it stays aligned with ``uniChars``: the
# earlier hand-typed literal had only three ``I`` and nineteen ``O`` in the
# uppercase run, which mapped ``Ĩ`` and ``Ị`` to ``O``.
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"  # the six bare modified letters repeated at the end of uniChars
)
def loaddicchar():
    """Build the replacement table used by ``convert_unicode``.

    Two ``|``-separated lists of accented Vietnamese letters are split and
    paired position by position; each entry of the first list becomes a key
    whose value is the entry at the same position in the second list.

    NOTE(review): the two literals look the same when rendered; presumably the
    first one holds a different encoding of each letter (e.g. base letter plus
    combining marks) - confirm against the raw bytes before editing them.

    :return: dict mapping each source form to its replacement form.
    """
    source_forms = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    target_forms = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    # Both lists have the same number of entries, so pairing them up is
    # equivalent to walking them with a shared index.
    return dict(zip(source_forms, target_forms))
dicchar = loaddicchar()

# One alternation over every key of ``dicchar``, in the table's own order.
# Deriving it from the table (instead of repeating the letters in a second
# hand-written literal) guarantees that every match has an entry in
# ``dicchar``, so the substitution callback can never raise ``KeyError``.
_DICCHAR_PATTERN = re.compile('|'.join(re.escape(key) for key in dicchar))


def convert_unicode(txt):
    """Replace every occurrence of a ``dicchar`` key in *txt* by its value.

    :param txt: text to convert.
    :return: *txt* with each matched source form swapped for the replacement
        form listed in ``dicchar``; everything else is left untouched.
    """
    return _DICCHAR_PATTERN.sub(lambda match: dicchar[match.group()], txt)
""" | |
Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
Ví dụ: thủy = thuyr, tượng = tuwowngj | |
""" | |
# Vowel table: one row per base vowel.  Columns 0-5 hold the vowel with no
# tone mark followed by its five toned forms; the last column is the Telex
# spelling of the base vowel.
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]

# Telex key appended for each tone column of the table above (column 0, the
# unmarked vowel, adds nothing).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, column) in ``bang_nguyen_am``.
# The trailing Telex column is left out of the index.
nguyen_am_to_ids = {}
for row_no, vowel_row in enumerate(bang_nguyen_am):
    for col_no, vowel in enumerate(vowel_row[:-1]):
        nguyen_am_to_ids[vowel] = (row_no, col_no)
def vn_word_to_telex_type(word):
    """Spell a single accented word as Telex keystrokes.

    Every vowel found in ``nguyen_am_to_ids`` is replaced by the Telex
    spelling from the last column of ``bang_nguyen_am``; other characters are
    copied unchanged.  The Telex key for the word's tone mark is appended once
    at the end (if several vowels carry a tone, the last one wins).

    :param word: word to convert.
    :return: the Telex spelling of *word*.
    """
    tone = 0
    pieces = []
    for letter in word:
        row, col = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            # Not a table vowel: keep the character as it is.
            pieces.append(letter)
        else:
            if col != 0:
                tone = col
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to its Telex spelling.

    The sentence is split on whitespace, each token goes through
    ``vn_word_to_telex_type`` and the results are joined with single spaces.

    :param sentence: sentence to convert.
    :return: the converted sentence.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
""" | |
End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
""" | |
""" | |
Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF | |
""" | |
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of a single word to its old-style position.

    The word is first stripped of its tone mark (remembering which tone it
    was), then the mark is put back on one vowel chosen as follows:

    * a word with fewer than two counted vowels is returned unchanged, unless
      it starts with ``qu``/``gi`` - then the mark goes on the third letter
      when that is a vowel, otherwise on the ``u``/``i`` itself;
    * if the vowel group contains ``ê`` or ``ơ``, that vowel takes the mark;
    * with exactly two vowels that end the word, the first vowel takes it;
    * in every other case the second vowel takes it.

    :param word: lower-case word without surrounding punctuation.
    :return: the word with its tone mark repositioned, or *word* itself when
        ``is_valid_vietnam_word`` rejects it.
    """
    if not is_valid_vietnam_word(word):
        return word

    letters = list(word)
    tone = 0
    vowel_positions = []
    has_qu_or_gi = False
    for pos, letter in enumerate(letters):
        row, col = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            continue
        if pos != 0:
            previous = letters[pos - 1]
            if row == 9 and previous == 'q':  # "qu"
                letters[pos] = 'u'
                has_qu_or_gi = True
            elif row == 5 and previous == 'g':  # "gi"
                letters[pos] = 'i'
                has_qu_or_gi = True
        if col != 0:
            # Remember the tone and leave only the bare vowel in place.
            tone = col
            letters[pos] = bang_nguyen_am[row][0]
        # The u/i of a leading "qu"/"gi" is not counted as a vowel.
        if pos != 1 or not has_qu_or_gi:
            vowel_positions.append(pos)

    if len(vowel_positions) < 2:
        if not has_qu_or_gi:
            return word
        if len(letters) == 2:
            row, _ = nguyen_am_to_ids.get(letters[1])
            letters[1] = bang_nguyen_am[row][tone]
        else:
            row, _ = nguyen_am_to_ids.get(letters[2], (-1, -1))
            if row != -1:
                letters[2] = bang_nguyen_am[row][tone]
            elif letters[1] == 'i':
                letters[1] = bang_nguyen_am[5][tone]
            else:
                letters[1] = bang_nguyen_am[9][tone]
        return ''.join(letters)

    # ê and ơ always carry the tone mark when present.
    for pos in vowel_positions:
        row, _ = nguyen_am_to_ids[letters[pos]]
        if row in (4, 8):
            letters[pos] = bang_nguyen_am[row][tone]
            return ''.join(letters)

    if len(vowel_positions) == 2 and vowel_positions[-1] == len(letters) - 1:
        target = vowel_positions[0]
    else:
        target = vowel_positions[1]
    row, _ = nguyen_am_to_ids[letters[target]]
    letters[target] = bang_nguyen_am[row][tone]
    return ''.join(letters)
def is_valid_vietnam_word(word):
    """Tell whether all vowels of *word* sit next to each other.

    A word passes when the characters found in ``nguyen_am_to_ids`` form one
    contiguous run; a word with no vowel or a single vowel also passes.

    :param word: word to check.
    :return: ``False`` as soon as two vowels are separated by another
        character, ``True`` otherwise.
    """
    vowel_positions = [pos for pos, letter in enumerate(word)
                       if letter in nguyen_am_to_ids]
    return all(later - earlier == 1
               for earlier, later in zip(vowel_positions, vowel_positions[1:]))
# A token made of optional leading punctuation, a core of letters (dots are
# allowed inside, and the core must end with a letter), and optional trailing
# punctuation.  Needs the ``regex`` module (imported as ``re``) for ``\p{..}``.
_TOKEN_WITH_PUNCTUATION = re.compile(r'(\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*)')


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalise tone-mark placement of a sentence to the old style.

    The sentence is lower-cased and split on whitespace.  For every token that
    consists of letters optionally wrapped in punctuation, the letter core is
    passed through ``chuan_hoa_dau_tu_tieng_viet`` and the punctuation is put
    back around it.  All other tokens are kept as they are.

    The token is taken apart with capture groups rather than by inserting
    ``/`` separators and splitting on them: that approach rewrote any token
    that already contained two slashes (``12/02/2020`` became ``12022020``).

    :param sentence: sentence to normalise.
    :return: the lower-cased sentence, tokens joined by single spaces.
    """
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        match = _TOKEN_WITH_PUNCTUATION.fullmatch(word)
        if match is None:
            continue
        head, core, tail = match.groups()
        words[index] = head + chuan_hoa_dau_tu_tieng_viet(core) + tail
    return ' '.join(words)
""" | |
End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_tắc_đặt_dấu_thanh_trong_chữ_quốc_ngữ | |
""" | |
if __name__ == '__main__':
    # Small demonstration: normalise one sample sentence and print the result.
    sample_sentence = 'anh hoà, đang làm.. gì'
    print(chuan_hoa_dau_cau_tieng_viet(sample_sentence))
    # A disabled batch driver used to follow here.  It read a corpus file line
    # by line, lower-cased and stripped each line, ran it through an encoding
    # conversion and chuan_hoa_dau_cau_tieng_viet, and appended the result to
    # "<corpus>.out", reporting progress every 1000 lines.
package utils; | |
import java.util.*; | |
/**
 * Helpers for normalising Vietnamese text: character-form conversion and
 * repositioning of tone marks to the old style (e.g. "thuỳ" becomes "thùy").
 */
public class NlpUtils {
    /** Replacement table built by {@link #loadDictChar()}: source form to replacement form. */
    static Map<String, String> dictChar;

    /**
     * Vowel table: one row per base vowel; column 0 is the vowel without a
     * tone mark, columns 1-5 are its five toned forms.
     */
    static Character[][] vowelTable = {
            {'a', 'à', 'á', 'ả', 'ã', 'ạ'},
            {'ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ'},
            {'â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ'},
            {'e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ'},
            {'ê', 'ề', 'ế', 'ể', 'ễ', 'ệ'},
            {'i', 'ì', 'í', 'ỉ', 'ĩ', 'ị'},
            {'o', 'ò', 'ó', 'ỏ', 'õ', 'ọ'},
            {'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ'},
            {'ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ'},
            {'u', 'ù', 'ú', 'ủ', 'ũ', 'ụ'},
            {'ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự'},
            {'y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ'}
    };

    /** Every character a word may contain to be treated as Vietnamese by {@link #isVietnamWord}. */
    static Set<Character> vietnamChars;
    /** Vowel character to its row in {@link #vowelTable}. */
    static Map<Character, Integer> vowelLookupRow = new HashMap<>();
    /** Vowel character to its column (tone index) in {@link #vowelTable}. */
    static Map<Character, Integer> vowelLookupColumn = new HashMap<>();

    static {
        dictChar = loadDictChar();
        for (int i = 0; i < vowelTable.length; i++) {
            for (int j = 0; j < vowelTable[i].length; j++) {
                vowelLookupRow.put(vowelTable[i][j], i);
                vowelLookupColumn.put(vowelTable[i][j], j);
            }
        }
        vietnamChars = new HashSet<>(Arrays.asList('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
                'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'à', 'á', 'ả', 'ã',
                'ạ', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'ề', 'ế', 'ể', 'ễ', 'ệ',
                'ì', 'í', 'ỉ', 'ĩ', 'ị', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ù',
                'ú', 'ủ', 'ũ', 'ụ', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'À', 'Á', 'Ả', 'Ã', 'Ạ', 'Ầ', 'Ấ',
                'Ẩ', 'Ẫ', 'Ậ', 'Ằ', 'Ắ', 'Ẳ', 'Ẵ', 'Ặ', 'È', 'É', 'Ẻ', 'Ẽ', 'Ẹ', 'Ề', 'Ế', 'Ể', 'Ễ', 'Ệ', 'Ì', 'Í', 'Ỉ',
                'Ĩ', 'Ị', 'Ò', 'Ó', 'Ỏ', 'Õ', 'Ọ', 'Ô', 'Ồ', 'Ố', 'Ổ', 'Ỗ', 'Ộ', 'Ờ', 'Ớ', 'Ở', 'Ỡ', 'Ợ', 'Ù', 'Ú', 'Ủ', 'Ũ',
                'Ụ', 'Ừ', 'Ứ', 'Ử', 'Ữ', 'Ự', 'Ỳ', 'Ý', 'Ỷ', 'Ỹ', 'Ỵ', 'đ', 'Đ', 'ă', 'Ă', 'â', 'Â', 'ê', 'Ê', 'ô', 'Ô', 'ơ', 'Ơ', 'ư', 'Ư'));
    }

    /**
     * Builds the replacement table by pairing two '|'-separated lists entry by entry.
     * NOTE(review): most entries look identical in both lists when rendered;
     * presumably the first list holds a different encoding of each letter -
     * confirm against the raw bytes before editing either literal.
     *
     * @return map from each entry of the first list to the entry at the same
     *         position in the second list
     */
    private static Map<String, String> loadDictChar() {
        String[] char1252 = ("à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|" +
                "ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ" +
                "|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ|Ð").split("\\|");
        String[] charUTF8 = ("à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|" +
                "ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|" +
                "Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ|Đ").split("\\|");
        Map<String, String> dictChar = new HashMap<>();
        for (int i = 0; i < char1252.length; i++) {
            dictChar.put(char1252[i], charUTF8[i]);
        }
        return dictChar;
    }

    /**
     * Replaces every occurrence of a {@link #dictChar} key in the sentence by its value.
     *
     * @param sentence text to convert
     * @return the converted text
     */
    public static String convertUnicde(String sentence) {
        for (Map.Entry<String, String> entry : dictChar.entrySet()) {
            // Keys are plain text, so a literal replace is used instead of a regex.
            sentence = sentence.replace(entry.getKey(), entry.getValue());
        }
        return sentence;
    }

    /**
     * Checks whether the word looks like a toned Vietnamese word: every
     * character is in {@link #vietnamChars}, all vowels are adjacent, and
     * exactly one vowel carries a tone mark.
     *
     * @param word word to check; must already be lower-case because
     *             {@link #vowelTable} only lists lower-case vowels
     * @return true only when the word passes all three checks
     */
    private static boolean isVietnamWord(String word) {
        boolean hasAccent = false;
        int currentVowel = -1;
        for (int i = 0; i < word.length(); i++) {
            if (!vietnamChars.contains(word.charAt(i))) return false;
            if (vowelLookupRow.containsKey(word.charAt(i))) {
                if (currentVowel == -1)
                    currentVowel = i;
                else {
                    if (i - currentVowel != 1) return false;
                    currentVowel = i;
                }
                if (vowelLookupColumn.get(word.charAt(i)) > 0) {
                    if (hasAccent) return false; // a word cannot carry two tone marks
                    hasAccent = true;
                }
            }
        }
        return hasAccent;
    }

    /**
     * Repositions the tone mark of one whitespace-free token to the old style.
     * Non-letter characters before and after the letter run are split off and
     * re-attached unchanged around the result.
     *
     * @param word token to correct
     * @return the token in lower case with its tone mark repositioned; tokens
     *         without any letter are returned untouched
     */
    private static String correctVnAccentWord(String word) {
        if (!word.matches(".*\\p{L}+.*")) {
            return word;
        }
        // Separate leading/trailing non-letters from the letter run with spaces.
        word = word.replaceAll("^([^\\p{L}]*)([\\p{L}]+)([^\\p{L}]*)$", "$1 $2 $3").trim();
        String[] parts = word.split("\\s+");
        String head = "", tail = "";
        if (parts.length == 1) {
            word = parts[0];
        } else if (parts.length == 2) {
            if (parts[0].matches("\\p{L}+")) {
                word = parts[0];
                tail = parts[1];
            } else {
                head = parts[0];
                word = parts[1];
            }
        } else {
            head = parts[0];
            word = parts[1];
            tail = parts[2];
        }
        word = word.toLowerCase();
        if (!isVietnamWord(word)) return head + word + tail;

        char[] chars = word.toCharArray();
        int accentPosition = 0, x, y;
        boolean isQuOrGi = false;
        List<Integer> vowelsIndex = new ArrayList<>();
        for (int i = 0; i < chars.length; i++) {
            x = vowelLookupRow.getOrDefault(chars[i], -1);
            y = vowelLookupColumn.getOrDefault(chars[i], -1);
            if (x == -1) continue;
            else if (x == 9) { // qu
                if (i != 0 && chars[i - 1] == 'q') {
                    chars[i] = 'u';
                    isQuOrGi = true;
                }
            } else if (x == 5) { // gi
                if (i != 0 && chars[i - 1] == 'g') {
                    chars[i] = 'i';
                    isQuOrGi = true;
                }
            }
            if (y != 0) {
                // Remember the tone and leave only the bare vowel in place.
                accentPosition = y;
                chars[i] = vowelTable[x][0];
            }
            // The u/i of a leading "qu"/"gi" is not counted as a vowel.
            if (!isQuOrGi || i != 1) {
                vowelsIndex.add(i);
            }
        }

        if (vowelsIndex.size() < 2) {
            if (isQuOrGi) {
                if (chars.length == 2) {
                    x = vowelLookupRow.get(chars[1]);
                    chars[1] = vowelTable[x][accentPosition];
                } else {
                    x = vowelLookupRow.getOrDefault(chars[2], -1);
                    if (x != -1) {
                        chars[2] = vowelTable[x][accentPosition];
                    } else {
                        chars[1] = (chars[1] == 'i' ? vowelTable[5][accentPosition] : vowelTable[9][accentPosition]);
                    }
                }
                return head + String.copyValueOf(chars) + tail;
            }
            return head + word + tail;
        }

        // ê and ơ always carry the tone mark when present.
        for (int index : vowelsIndex) {
            x = vowelLookupRow.get(chars[index]);
            if (x == 4 || x == 8) {
                chars[index] = vowelTable[x][accentPosition];
                return head + String.copyValueOf(chars) + tail;
            }
        }

        if (vowelsIndex.size() == 2) {
            if (vowelsIndex.get(vowelsIndex.size() - 1) == chars.length - 1) {
                // Two vowels ending the word: the first one takes the mark.
                x = vowelLookupRow.get(chars[vowelsIndex.get(0)]);
                chars[vowelsIndex.get(0)] = vowelTable[x][accentPosition];
            } else {
                x = vowelLookupRow.get(chars[vowelsIndex.get(1)]);
                chars[vowelsIndex.get(1)] = vowelTable[x][accentPosition];
            }
        } else {
            x = vowelLookupRow.get(chars[vowelsIndex.get(1)]);
            chars[vowelsIndex.get(1)] = vowelTable[x][accentPosition];
        }
        return head + String.copyValueOf(chars) + tail;
    }

    /**
     * Records, for each character of the word, whether it is upper-case.
     *
     * @param word word to inspect
     * @return one flag per character, in order
     */
    private static List<Boolean> getUpperState(String word) {
        List<Boolean> uppers = new ArrayList<>();
        for (char c : word.toCharArray()) {
            uppers.add(Character.isUpperCase(c));
        }
        return uppers;
    }

    /**
     * Upper-cases the characters of the word whose flag is set.
     *
     * @param word   word to re-capitalise
     * @param uppers flags produced by {@link #getUpperState(String)}
     * @return the word with the flagged positions upper-cased
     */
    private static String updateUpperState(String word, List<Boolean> uppers) {
        char[] chars = word.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // Positions beyond the recorded flags are left as they are.
            if (i < uppers.size() && uppers.get(i)) {
                chars[i] = Character.toUpperCase(chars[i]);
            }
        }
        return String.copyValueOf(chars);
    }

    /**
     * Repositions tone marks of every word in the sentence to the old style
     * while keeping the original capitalisation and surrounding punctuation.
     * The sentence is first passed through {@link #convertUnicde(String)},
     * then split on whitespace; the words are joined back with single spaces.
     *
     * @param sentence sentence to correct
     * @return the corrected sentence
     */
    public static String correctVnAccentSentence(String sentence) {
        // The previous code called convertChar1252ToUtf8, which this class does
        // not define; convertUnicde is the conversion method that exists here.
        sentence = convertUnicde(sentence);
        String[] words = sentence.split("\\s+");
        for (int i = 0; i < words.length; i++) {
            List<Boolean> uppers = getUpperState(words[i]);
            try {
                words[i] = updateUpperState(correctVnAccentWord(words[i]), uppers);
            } catch (RuntimeException e) {
                // Best effort: a word that cannot be processed is kept as it was.
            }
        }
        return String.join(" ", words);
    }

    /** Inserts a space between a letter and an adjacent quote, punctuation mark, slash or parenthesis. */
    private static String addExtraSpace(String sent) {
        sent = sent.replaceAll("([\"',.:'!?/”“\\(])(\\p{L})", "$1 $2")
                .replaceAll("(\\p{L})([\"',.:'!?/”“\\)])", "$1 $2");
        return sent;
    }

    /** Removes whitespace between an opening mark and the next letter, and between a letter and a following closing mark. */
    private static String removeExtraSpace(String sent) {
        sent = sent.replaceAll("([/“\\(])\\s+(\\p{L})", "$1$2")
                .replaceAll("(\\p{L})\\s+([,.:!?/\\)”])", "$1$2");
        return sent;
    }

    public static void main(String[] args) {
        System.out.println(NlpUtils.correctVnAccentSentence("Cái kiểu so sánh quận Nhất, U Minh gây tranh cãi vừa rồi, khiến dân chơi thể thao nhớ tới một người: Jose Mourinho."));
    }
}
This comment has been minimized.
This comment has been minimized.
Cập nhật phiên bản code Python, sửa lỗi chuẩn hóa dấu với các từ dính dấu ngắt câu: |
This comment has been minimized.
This comment has been minimized.
Cảm ơn nguyenvanhieuvn, cái này sẽ giúp mình rất nhiều |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
13/02/2020: Java version, Bổ sung chuẩn hóa dấu cho các từ đi kèm ký tự đặc biệt nhưng vẫn giữ nguyên các ký tự này, ví dụ: (thuỳ) => (thùy)