Skip to content

Instantly share code, notes, and snippets.

@zawhtutwin
Last active February 9, 2024 22:03
Show Gist options
  • Save zawhtutwin/de145f6c5942d1083491fe6404ae4270 to your computer and use it in GitHub Desktop.
Save zawhtutwin/de145f6c5942d1083491fe6404ae4270 to your computer and use it in GitHub Desktop.
// One-letter codes of the character categories used by the tables in this file.
// No code visible in this file reads this list.
const _CATEGORY_NAMES = ['C', 'M', 'V', 'S', 'A', 'F', 'I', 'E', 'G', 'D', 'P', 'W'];
// Pairs each category code with the list of Unicode code points that belong to it.
// range(end, start) is inclusive on both sides and takes the UPPER bound first,
// so range(0x1021,0x1000) covers U+1000 through U+1021.
const _CATEGORY_RANGE = [
['C', range(0x1021,0x1000)], // Consonants
['M', range(0x103E,0x103B)], // Medials
['V', range(0x1032,0x102B)], // Dependent Vowel Signs
['S', [0x1039]], // Myanmar Sign Virama
['A', [0x103A]], // Myanmar Sign Asat
['F', range(0x1038,0x1036)], // Dependent Various Signs
['I', [0x1024, 0x1027, 0x102A, 0x104C, 0x104D, 0x104F]], // Independent Vowels, Independent Various Signs
['E', [0x1023, 0x1025, 0x1026, 0x1029, 0x104E]], // Independent Vowels, Myanmar Symbol Aforementioned
['G', [0x103F]], // Myanmar Letter Great Sa
['D', range(0x1049,0x1040)], // Myanmar Digits
['P', range(0x104B,0x104A)], // Punctuation Marks
['W', [0x0020]], // White space
];
/**
 * Builds the list of consecutive numbers from `start` up to and including `end`.
 *
 * Note the argument order: the upper bound is passed first.
 *
 * @param {number} end - Last value of the sequence (inclusive).
 * @param {number} start - First value of the sequence.
 * @returns {number[]} `[start, start + 1, ..., end]`, or `[]` when `end < start`.
 */
function range(end,start){
  const values = [];
  for (let value = start; value <= end; value += 1) {
    values.push(value);
  }
  return values;
}
//console.log(_CATEGORY_NAMES);
//console.log(_CATEGORY_RANGE);
/**
 * Classifies every character of `sentence` by looking its code point up in
 * `_CATEGORY_RANGE`.
 *
 * The sentence is iterated by code point (string spread), so the result has
 * one entry per code point, in order.
 *
 * @param {string} sentence - Text to classify.
 * @returns {Array<{cat: string, val: string}|undefined>} For each character,
 *   its category code (`cat`) and the character itself (`val`). A character
 *   that belongs to no category produces `undefined` at its position.
 */
function getCateGory(sentence){
  // Trace of the raw input; kept so the console output stays unchanged.
  console.log(sentence);
  return [...sentence].map((ch) => {
    // Declared locally: assigning to an undeclared name would create an
    // implicit global and throws a ReferenceError in strict mode / ES modules.
    const codePoint = ch.codePointAt(0);
    // First category whose code point list contains this character.
    const match = _CATEGORY_RANGE.find((entry) => entry[1].includes(codePoint));
    if (match === undefined) {
      return undefined;
    }
    return { cat: match[0], val: ch };
  });
}
// Column index of each category letter inside the rows of the sequence tables
// below (the letters are numbered in alphabetical order).
const _LETTER_SEQUENCE_TABLE_INDEX = {
'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'I': 6, 'M': 7, 'P': 8, 'S': 9, 'V': 10, 'W': 11
};
// Status table for a pair of categories: the row key is the category of the
// first character, the column (see _LETTER_SEQUENCE_TABLE_INDEX) is the
// category of the second character. _getSyllableBreakStatus returns the cell.
// -2 is the value cut() reacts to by retrying with one more character.
// NOTE(review): presumably -1 = invalid sequence, 0 = no break after the first
// character, N >= 1 = break after the Nth character -- confirm against the
// algorithm this table was taken from.
const _LETTER_SEQUENCE_TABLE_2ND_CHARACTER = {
'A': [-1, -2, 1, 1, 0, -1, 1, 0, 1, 0, 0, 1],
'C': [0, -2, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1],
'D': [-1, 1, 0, 1, -1, -1, 1, -1, 1, -1, -1, 1],
'E': [-1, -2, 1, 1, 2, 0, 1, -1, 1, -1, 0, 1],
'F': [-1, -2, 1, 1, 2, -1, 1, -1, 1, -1, -1, 1],
'G': [-1, 1, 1, 1, 0, -1, 1, -1, 1, -1, 0, 1],
'I': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1],
'M': [2, -2, 1, 1, 0, 0, 1, 0, 1, -1, 0, 1],
'P': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1],
'S': [-1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
'V': [2, -2, 1, 1, 0, 0, 1, -1, 1, -1, 0, 1],
'W': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 0],
};
// Status table for three categories: the row key is the first two categories,
// the column is the third. Only two-letter prefixes ending in 'C' have rows;
// any other prefix makes _getSyllableBreakStatus return -1.
const _LETTER_SEQUENCE_TABLE_3RD_CHARACTER = {
'AC': [3, 1, 1, 1, 1, 1, 1, -2, 1, 1, 1, 1],
'CC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
'EC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
'FC': [3, 1, 1, 1, 1, 1, 1, -2, 1, 1, 1, 1],
'MC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
'VC': [0, 1, 1, 1, 1, 1, 1, -2, 1, 0, 1, 1],
};
// Status table for four categories: the row key is the first three categories,
// the column is the fourth. This table contains no -2 entries.
const _LETTER_SEQUENCE_TABLE_4TH_CHARACTER = {
'ACM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'FCM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'VCM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
};
/**
 * Looks up the break status for a short run of category letters.
 *
 * All letters but the last select a row of the sequence table for that run
 * length; the last letter selects the column via _LETTER_SEQUENCE_TABLE_INDEX.
 *
 * @param {string} categorys - Category letters of the characters to inspect.
 * @param {number} categorysLen - Run length to evaluate; only 2, 3 and 4 have tables.
 * @returns {number} The table cell, or -1 when the length has no table, the
 *   prefix has no row, or the last letter has no column.
 */
function _getSyllableBreakStatus(categorys, categorysLen) {
  let table;
  switch (categorysLen) {
    case 2:
      table = _LETTER_SEQUENCE_TABLE_2ND_CHARACTER;
      break;
    case 3:
      table = _LETTER_SEQUENCE_TABLE_3RD_CHARACTER;
      break;
    case 4:
      table = _LETTER_SEQUENCE_TABLE_4TH_CHARACTER;
      break;
    default:
      // No table exists for this run length.
      return -1;
  }

  // Everything except the final letter identifies the row.
  const prefix = categorys.slice(0, categorysLen - 1);
  console.log(prefix);

  const row = table[prefix];
  if (row === undefined) {
    return -1;
  }

  // The final letter identifies the column; it is undefined when the input
  // is shorter than categorysLen.
  const column = _LETTER_SEQUENCE_TABLE_INDEX[categorys[categorysLen - 1]];
  if (column === undefined) {
    return -1;
  }
  return row[column];
}
/**
 * Splits a Myanmar sentence into syllables and logs the result.
 *
 * Each position is classified with getCateGory, then _getSyllableBreakStatus
 * is consulted with 2, 3 and finally 4 characters of look-ahead until it
 * returns something other than -2 ("undetermined"). The status is applied as:
 *   - N >= 1 : emit the next N characters, then a '|' separator
 *              (e.g. the pairs "F F" and "V A" yield 2, so both characters
 *              are kept and the break follows the second one);
 *   - 0, -1  : emit the current character with no separator;
 *   - -2     : input ended before the sequence could be resolved; break after
 *              the current character.
 * No separator is written after the final character.
 *
 * @param {string} [sen] - Sentence to segment; defaults to the built-in sample.
 * @returns {string} The sentence with '|' inserted between syllables.
 */
function cut(sen = 'စာကြည့်တိုက်အတွင်းရှိကလေးငယ်များဆူဆူညံညံမပြုရန်သတိပြုပါ။'){
  const chars = [...sen];
  // Classify once and reuse the result for both the category string and the text.
  const classified = getCateGory(sen);
  // Characters without a category get the placeholder '?', which has no row or
  // column in the sequence tables and is therefore emitted without a break.
  const categories = classified
    .map((entry) => (entry === undefined ? '?' : entry.cat))
    .join('');
  const text = classified.map((entry, i) => (entry === undefined ? chars[i] : entry.val));
  const total = text.length;

  let finalStr = '';
  let finalText = '';
  let start = 0;
  while (start < total) {
    const remaining = total - start;
    // A lone trailing character has nothing to be compared with.
    let status = -1;
    if (remaining >= 2) {
      status = _getSyllableBreakStatus(categories.slice(start, start + 2), 2);
      if (status === -2 && remaining >= 3) {
        status = _getSyllableBreakStatus(categories.slice(start, start + 3), 3);
      }
      if (status === -2 && remaining >= 4) {
        status = _getSyllableBreakStatus(categories.slice(start, start + 4), 4);
      }
    }

    let take = 1;
    let breakAfter = false;
    if (status === -2) {
      // Still undetermined because the input ran out: break after this character.
      breakAfter = true;
    } else if (status >= 1) {
      // Break after the Nth character: keep all N characters together.
      take = Math.min(status, remaining);
      breakAfter = true;
    }

    const end = start + take;
    finalStr += categories.slice(start, end);
    finalText += text.slice(start, end).join('');
    if (breakAfter && end < total) {
      finalStr += '|';
      finalText += '|';
    }
    start = end;
  }

  console.log(finalStr);
  //"CV|CMCAF|CVVCA|C|CMCAF|CMV|C|CVF|CCA|CMVF|CV|CV|CF|CF|C|CMV|CCA|C|CV|CMV|CV|P"
  console.log(finalText);
  //"စာ|ကြည့်|တိုက်|အ|တွင်း|ရှိ|က|လေး|ငယ်|များ|ဆူ|ဆူ|ညံ|ညံ|မ|ပြု|ရန်|သ|တိ|ပြု|ပါ|။"
  return finalText;
}
// Run the segmentation on the built-in sample sentence as soon as the script loads.
cut();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment