Skip to content

Instantly share code, notes, and snippets.

@enamoria
Forked from behitek/NlpUtils.java
Created October 10, 2019 09:13
Show Gist options
  • Save enamoria/e11edd8ec32863e2d83652f120c450c6 to your computer and use it in GitHub Desktop.
Save enamoria/e11edd8ec32863e2d83652f120c450c6 to your computer and use it in GitHub Desktop.
Chuẩn hóa cách gõ dấu câu về kiểu gõ cũ
"""
Copyright @ nguyenvanhieu.vn
Thằng code python này không giữ được lower/upper case
Sẽ update khi rảnh
"""
import os
import re
import sys
import time
import traceback
import unicodedata
# from Logger import LogEventSourcing
from datetime import datetime

import dateutil.parser
import requests
# logger = LogEventSourcing()
def call_api(data, url, method, timeout=3):
    """Send an HTTP request carrying ``data`` and return the response.

    The request is issued through ``requests.request`` with a fixed set of
    headers that declare a form-urlencoded body and disable caching.

    :param data: payload passed as the ``data`` argument of the request.
    :param url: target URL.
    :param method: HTTP method name handed to ``requests.request``.
    :param timeout: timeout passed to ``requests.request`` (default 3).
    :return: the response object returned by ``requests.request``.
    """
    return requests.request(
        method,
        url,
        data=data,
        headers={
            'content-type': "application/x-www-form-urlencoded",
            'cache-control': "no-cache",
            'postman-token': "6a410524-a8e2-79c7-bd9d-53e4b68c84c7",
        },
        timeout=timeout,
    )
# Vietnamese letters that carry a diacritic: the lower-case groups
# a, e, d, i, o, u, y, then the same groups in upper case, then the six bare
# upper-case modified base letters.
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
# ASCII counterpart of ``uniChars``; the character at each position is the
# unaccented form of the character at the same position in ``uniChars``.
# It is spelled as run lengths so each run can be checked against its group
# above (17 a-forms, 11 e-forms, 1 d, 5 i-forms, 17 o-forms, 11 u-forms,
# 5 y-forms) instead of counting letters in one long literal.
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
def loaddicchar():
    """Build the lookup table used to rewrite legacy-encoded accented letters.

    Two ``|``-separated lists of accented Vietnamese letters are paired
    position by position: entries of the first list become the keys and
    entries of the second list become the values.

    :return: dict mapping each legacy form to its replacement.
    """
    # NOTE(review): the two tables render identically here; presumably the
    # first one is meant to hold base-letter + combining-mark sequences.
    # Verify the actual bytes before relying on this mapping.
    legacy_forms = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split('|')
    utf8_forms = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
    return dict(zip(legacy_forms, utf8_forms))


# Module-level table built once at import time.
dicchar = loaddicchar()
def convertwindown1525toutf8(txt):
    """Return ``txt`` with decomposed accented letters recomposed.

    Text typed or exported with legacy Vietnamese encodings often stores a
    tone mark as a separate combining character after its base letter.  This
    function folds such sequences into single precomposed code points by
    applying Unicode NFC normalisation, which covers both the
    "modified letter + tone mark" form and the fully decomposed
    "plain letter + modifier + tone mark" form.

    :param txt: input text.
    :return: ``txt`` in Unicode NFC form; text that is already composed is
        returned unchanged.
    """
    # NFC is used instead of a hand-written replacement table: a table relies
    # on invisible combining characters surviving in the source literals,
    # which editors and copy/paste easily destroy.
    return unicodedata.normalize('NFC', txt)
"""
Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
Ví dụ: thủy = thuyr, tượng = tuwowngj
"""
# Vowel table: one row per base vowel.  Column 0 is the vowel without a tone,
# columns 1-5 are the same vowel with each of the five tone marks, and the
# last column is the Telex spelling of the base vowel.
bang_nguyen_am = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]

# Telex key for each tone column of ``bang_nguyen_am`` (index 0 = no tone).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, column) in ``bang_nguyen_am``.
# The trailing Telex column of every row is left out.
nguyen_am_to_ids = {
    vowel: (row_no, col_no)
    for row_no, row in enumerate(bang_nguyen_am)
    for col_no, vowel in enumerate(row[:-1])
}
def vn_word_to_telex_type(word):
    """Spell one Vietnamese word as raw Telex keystrokes.

    Every vowel found in ``nguyen_am_to_ids`` is replaced by the Telex
    spelling of its base vowel; all other characters are copied unchanged.
    The Telex key of the last tone mark seen is appended at the end of the
    word (nothing is appended when the word has no tone mark).
    Example: ``thủy`` -> ``thuyr``, ``tượng`` -> ``tuwowngj``.

    :param word: a single word.
    :return: the Telex spelling of ``word``.
    """
    tone = 0
    pieces = []
    for letter in word:
        row, col = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            # Not a known vowel: keep the character as it is.
            pieces.append(letter)
        else:
            if col != 0:
                tone = col
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """
    Convert an accented Vietnamese sentence to raw Telex keystrokes.

    The sentence is split on whitespace, each word is converted with
    ``vn_word_to_telex_type`` and the results are joined with single spaces.
    :param sentence: accented Vietnamese text.
    :return: the Telex spelling of the sentence.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
"""
End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
"""
"""
Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its old-style position.

    Words rejected by ``is_valid_vietnam_word`` are returned unchanged.
    Otherwise every vowel is stripped to its base form while the tone is
    remembered, and the tone is put back on exactly one vowel:

    * ``qu`` / ``gi`` at the start count as consonants; with fewer than two
      other vowels the tone goes on the letter after them when that is a
      vowel, otherwise on the ``u`` / ``i`` itself;
    * a word with a single vowel is returned unchanged;
    * the first ``ê`` or ``ơ`` takes the tone when present;
    * two vowels ending the word: the first vowel takes the tone;
    * any other case: the second vowel takes the tone.

    :param word: a single word (lower case is expected by the vowel table).
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    letters = list(word)
    tone = 0
    vowel_positions = []
    has_qu_or_gi = False

    for pos, letter in enumerate(letters):
        row, col = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            continue
        if pos != 0:
            previous = letters[pos - 1]
            if row == 9 and previous == 'q':  # "qu"
                letters[pos] = 'u'
                has_qu_or_gi = True
            elif row == 5 and previous == 'g':  # "gi"
                letters[pos] = 'i'
                has_qu_or_gi = True
        if col != 0:
            # Remember the tone and reduce the vowel to its base form.
            tone = col
            letters[pos] = bang_nguyen_am[row][0]
        # The u/i of a leading qu/gi is not counted as a vowel.
        if not (has_qu_or_gi and pos == 1):
            vowel_positions.append(pos)

    if len(vowel_positions) < 2:
        if not has_qu_or_gi:
            return word
        if len(letters) == 2:
            row, _ = nguyen_am_to_ids.get(letters[1])
            letters[1] = bang_nguyen_am[row][tone]
        else:
            row, _ = nguyen_am_to_ids.get(letters[2], (-1, -1))
            if row != -1:
                letters[2] = bang_nguyen_am[row][tone]
            elif letters[1] == 'i':
                letters[1] = bang_nguyen_am[5][tone]
            else:
                letters[1] = bang_nguyen_am[9][tone]
        return ''.join(letters)

    # ê and ơ always take the tone when present.
    for pos in vowel_positions:
        row, _ = nguyen_am_to_ids[letters[pos]]
        if row in (4, 8):
            letters[pos] = bang_nguyen_am[row][tone]
            return ''.join(letters)

    ends_with_vowel = vowel_positions[-1] == len(letters) - 1
    if len(vowel_positions) == 2 and ends_with_vowel:
        target = vowel_positions[0]
    else:
        target = vowel_positions[1]
    row, _ = nguyen_am_to_ids[letters[target]]
    letters[target] = bang_nguyen_am[row][tone]
    return ''.join(letters)
def is_valid_vietnam_word(word):
    """Tell whether the vowels of ``word`` form one contiguous run.

    A word passes when no non-vowel character sits between two of its
    vowels; a word without any vowel also passes.  Vowels are the keys of
    ``nguyen_am_to_ids``.

    :param word: a single word.
    :return: ``False`` if two vowels are separated by another character,
        otherwise ``True``.
    """
    last_vowel = -1
    for pos, letter in enumerate(list(word)):
        if letter not in nguyen_am_to_ids:
            continue
        if last_vowel != -1 and pos - last_vowel != 1:
            return False
        last_vowel = pos
    return True
def chuan_hoa_dau_cau_tieng_viet(sentence, *, keep_case=False):
    """
    Normalise a Vietnamese sentence to the old-style tone-mark placement.

    The sentence is split on whitespace, every word is lower-cased and
    passed through ``chuan_hoa_dau_tu_tieng_viet``, and the words are joined
    back with single spaces.
    :param sentence: text to normalise.
    :param keep_case: when ``False`` (default) the result is lower case.
        When ``True`` the original casing is put back position by position;
        a word whose length changes when lower-cased (for example one
        containing ``İ``) is returned exactly as it was given.
    :return: the normalised sentence.
    """
    if not keep_case:
        return ' '.join(
            chuan_hoa_dau_tu_tieng_viet(word)
            for word in sentence.lower().split()
        )

    result = []
    for original in sentence.split():
        fixed = chuan_hoa_dau_tu_tieng_viet(original.lower())
        if len(fixed) != len(original):
            # Lower-casing changed the length, so character positions no
            # longer line up; leave the word untouched instead of guessing.
            result.append(original)
            continue
        result.append(''.join(
            # Keep untouched characters exactly as given; only characters
            # changed by the normalisation need their case restored.
            old if old.lower() == new
            else (new.upper() if old.isupper() else new)
            for old, new in zip(original, fixed)
        ))
    return ' '.join(result)
"""
End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
"""
if __name__ == '__main__':
    # Small demo: normalise one sample sentence and print the result.
    print(chuan_hoa_dau_cau_tieng_viet('nếu ngày mai trời nắng'))
/*
Copyright @ nguyenvanhieu.vn
Thằng code java này vẫn giữ được lower/upper case
Code này ngon hơn, check đúng trường hợp cần thêm dấu thì mới thêm
Tuy nhiên code python ở dưới không check nhưng vẫn chưa thấy bugs nào hết :v
*/
package utils;
import java.util.*;
/**
 * Helpers for normalising Vietnamese text: rewriting legacy-encoded accented
 * letters and moving tone marks to their old-style position while keeping
 * the original upper/lower case of every character.
 */
public class NlpUtils {
    /** Legacy accented letter -> replacement; filled by the static initialiser. */
    static Map<String, String> dictChar;

    /**
     * One row per base vowel. Column 0 is the vowel without a tone, columns
     * 1-5 are the same vowel carrying each of the five tone marks.
     */
    static Character[][] vowelTable = {
            {'a', 'à', 'á', 'ả', 'ã', 'ạ'},
            {'ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ'},
            {'â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ'},
            {'e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ'},
            {'ê', 'ề', 'ế', 'ể', 'ễ', 'ệ'},
            {'i', 'ì', 'í', 'ỉ', 'ĩ', 'ị'},
            {'o', 'ò', 'ó', 'ỏ', 'õ', 'ọ'},
            {'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ'},
            {'ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ'},
            {'u', 'ù', 'ú', 'ủ', 'ũ', 'ụ'},
            {'ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự'},
            {'y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ'}
    };

    /** Every character accepted inside a Vietnamese word. */
    static Set<Character> vietnamChars;
    /** Vowel character -> its row in {@link #vowelTable}. */
    static Map<Character, Integer> vowelLookupRow = new HashMap<>();
    /** Vowel character -> its column (tone) in {@link #vowelTable}. */
    static Map<Character, Integer> vowelLookupColumn = new HashMap<>();

    static {
        dictChar = loadDictChar();
        for (int i = 0; i < vowelTable.length; i++) {
            for (int j = 0; j < vowelTable[i].length; j++) {
                vowelLookupRow.put(vowelTable[i][j], i);
                vowelLookupColumn.put(vowelTable[i][j], j);
            }
        }
        vietnamChars = new HashSet<>(Arrays.asList('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
                'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'à', 'á', 'ả', 'ã',
                'ạ', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'ề', 'ế', 'ể', 'ễ', 'ệ',
                'ì', 'í', 'ỉ', 'ĩ', 'ị', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ù',
                'ú', 'ủ', 'ũ', 'ụ', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'À', 'Á', 'Ả', 'Ã', 'Ạ', 'Ầ', 'Ấ',
                'Ẩ', 'Ẫ', 'Ậ', 'Ằ', 'Ắ', 'Ẳ', 'Ẵ', 'Ặ', 'È', 'É', 'Ẻ', 'Ẽ', 'Ẹ', 'Ề', 'Ế', 'Ể', 'Ễ', 'Ệ', 'Ì', 'Í', 'Ỉ',
                'Ĩ', 'Ị', 'Ò', 'Ó', 'Ỏ', 'Õ', 'Ọ', 'Ô', 'Ồ', 'Ố', 'Ổ', 'Ỗ', 'Ộ', 'Ờ', 'Ớ', 'Ở', 'Ỡ', 'Ợ', 'Ù', 'Ú', 'Ủ', 'Ũ',
                'Ụ', 'Ừ', 'Ứ', 'Ử', 'Ữ', 'Ự', 'Ỳ', 'Ý', 'Ỷ', 'Ỹ', 'Ỵ', 'đ', 'Đ', 'ă', 'Ă', 'â', 'Â', 'ê', 'Ê', 'ô', 'Ô', 'ơ', 'Ơ', 'ư', 'Ư'));
    }

    /**
     * Pairs two '|'-separated lists of accented letters position by position.
     *
     * @return map from each entry of the first list to the entry at the same
     *         position in the second list
     */
    private static Map<String, String> loadDictChar() {
        // NOTE(review): both lists render identically here; presumably the
        // first is meant to hold base-letter + combining-mark sequences.
        // Verify the actual bytes.
        String[] char1252 = ("à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|" +
                "ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ" +
                "|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ").split("\\|");
        String[] charUTF8 = ("à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|" +
                "ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|" +
                "Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ").split("\\|");
        Map<String, String> dictChar = new HashMap<>();
        for (int i = 0; i < char1252.length; i++) {
            dictChar.put(char1252[i], charUTF8[i]);
        }
        return dictChar;
    }

    /**
     * Replaces every key of {@link #dictChar} found in the sentence with its
     * mapped value.
     *
     * @param sentence text to convert
     * @return the converted text
     */
    public static String convertChar1252ToUtf8(String sentence) {
        for (Map.Entry<String, String> entry : dictChar.entrySet()) {
            // Literal replacement: the keys are plain text, not regular expressions.
            sentence = sentence.replace(entry.getKey(), entry.getValue());
        }
        return sentence;
    }

    /**
     * Checks whether a word is a Vietnamese word that carries a tone mark.
     * The word must already be lower case.
     *
     * @param word lower-case word
     * @return true when every character is in {@link #vietnamChars}, all
     *         vowels are adjacent, and exactly one vowel carries a tone mark
     */
    private static boolean isVietnamWord(String word) {
        boolean hasAccent = false;
        int currentVowel = -1;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (!vietnamChars.contains(c)) return false;
            if (!vowelLookupRow.containsKey(c)) continue;
            // Vowels must form one contiguous run.
            if (currentVowel != -1 && i - currentVowel != 1) return false;
            currentVowel = i;
            if (vowelLookupColumn.get(c) > 0) {
                if (hasAccent) return false; // a word with two tone marks
                hasAccent = true;
            }
        }
        return hasAccent;
    }

    /**
     * Lower-cases a word and, when it is a toned Vietnamese word, moves the
     * tone mark to its old-style position.
     *
     * @param word a single word
     * @return the lower-cased word with the tone mark repositioned; words
     *         rejected by {@link #isVietnamWord(String)} are only lower-cased
     */
    private static String correctVnAccentWord(String word) {
        // Locale.ROOT keeps the mapping independent of the JVM default locale
        // (a Turkish default would map 'I' to dotless 'ı').
        word = word.toLowerCase(Locale.ROOT);
        if (!isVietnamWord(word)) return word;

        char[] chars = word.toCharArray();
        int accentPosition = 0, x, y;
        boolean isQuOrGi = false;
        List<Integer> vowelsIndex = new ArrayList<>();
        for (int i = 0; i < chars.length; i++) {
            x = vowelLookupRow.getOrDefault(chars[i], -1);
            y = vowelLookupColumn.getOrDefault(chars[i], -1);
            if (x == -1) continue;
            else if (x == 9) { // qu
                if (i != 0 && chars[i - 1] == 'q') {
                    chars[i] = 'u';
                    isQuOrGi = true;
                }
            } else if (x == 5) { // gi
                if (i != 0 && chars[i - 1] == 'g') {
                    chars[i] = 'i';
                    isQuOrGi = true;
                }
            }
            if (y != 0) {
                // Remember the tone and reduce the vowel to its base form.
                accentPosition = y;
                chars[i] = vowelTable[x][0];
            }
            // The u/i of a leading qu/gi is not counted as a vowel.
            if (!isQuOrGi || i != 1) {
                vowelsIndex.add(i);
            }
        }

        if (vowelsIndex.size() < 2) {
            if (isQuOrGi) {
                if (chars.length == 2) {
                    x = vowelLookupRow.get(chars[1]);
                    chars[1] = vowelTable[x][accentPosition];
                } else {
                    x = vowelLookupRow.getOrDefault(chars[2], -1);
                    if (x != -1) {
                        chars[2] = vowelTable[x][accentPosition];
                    } else {
                        chars[1] = (chars[1] == 'i' ? vowelTable[5][accentPosition] : vowelTable[9][accentPosition]);
                    }
                }
                return String.copyValueOf(chars);
            }
            return word;
        }

        // ê and ơ always take the tone when present.
        for (int index : vowelsIndex) {
            x = vowelLookupRow.get(chars[index]);
            if (x == 4 || x == 8) { // ê, ơ
                chars[index] = vowelTable[x][accentPosition];
                return String.copyValueOf(chars);
            }
        }

        // Two vowels ending the word: tone on the first; otherwise on the second.
        int target;
        if (vowelsIndex.size() == 2 && vowelsIndex.get(1) == chars.length - 1) {
            target = vowelsIndex.get(0);
        } else {
            target = vowelsIndex.get(1);
        }
        x = vowelLookupRow.get(chars[target]);
        chars[target] = vowelTable[x][accentPosition];
        return String.copyValueOf(chars);
    }

    /**
     * Records, for every character of the word, whether it is upper case.
     *
     * @param word a single word
     * @return one flag per character, true where the character is upper case
     */
    private static List<Boolean> getUpperState(String word) {
        List<Boolean> uppers = new ArrayList<>();
        for (char c : word.toCharArray()) {
            uppers.add(Character.isUpperCase(c));
        }
        return uppers;
    }

    /**
     * Upper-cases the characters of the word at the positions flagged true.
     *
     * @param word   word to re-case; must have as many characters as there are flags
     * @param uppers flags produced by {@link #getUpperState(String)}
     * @return the re-cased word
     */
    private static String updateUpperState(String word, List<Boolean> uppers) {
        char[] chars = word.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = uppers.get(i) ? Character.toUpperCase(chars[i]) : chars[i];
        }
        return String.copyValueOf(chars);
    }

    /**
     * Moves the tone mark of every word in the sentence to its old-style
     * position while keeping the original upper/lower case.
     *
     * @param sentence text to normalise; it is split on runs of whitespace
     * @return the words joined with single spaces
     */
    public static String correctVnAccentSentence(String sentence) {
        String[] words = sentence.split("\\s+");
        for (int i = 0; i < words.length; i++) {
            String corrected = correctVnAccentWord(words[i]);
            if (corrected.length() != words[i].length()) {
                // Lower-casing changed the length (e.g. 'İ' becomes two chars),
                // so the case flags no longer line up; keep the word as given.
                continue;
            }
            words[i] = updateUpperState(corrected, getUpperState(words[i]));
        }
        return String.join(" ", words);
    }

    public static void main(String[] args) {
        System.out.println(NlpUtils.correctVnAccentSentence("chuôí"));
    }
}
@vanchung1995
Copy link

Code java lỗi trong trường hợp này: "İlkay". Lý do lỗi là khi lowerCase thì toCharArray nó sẽ thêm 1 kí tự ' thành ra có 6 kí tự, còn nếu giữ nguyên hoặc upperCase sẽ chỉ có 5 kí tự

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment