Last active
October 24, 2022 07:17
-
-
Save seanghay/c7b209b40ee0677ee6e612d309cbaef4 to your computer and use it in GitHub Desktop.
Khmer Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Code points treated as whitespace: ASCII line breaks and tab, the Unicode
 * space characters listed below, and the zero-width (no-break) spaces.
 * Kept as a plain array so callers can use array membership methods on it.
 */
const SPACE_CODEPOINTS = [
  0xa, // line feed (\n)
  0xd, // carriage return (\r)
  0x9, // tab
  0x0020, // space
  0x00A0, // no-break space
  0x1680, // Ogham space mark
  0x180E, // Mongolian vowel separator
  0x2000, // en quad
  0x2001, // em quad
  0x2002, // en space (nut)
  0x2003, // em space (mutton)
  0x2004, // three-per-em space (thick space)
  0x2005, // four-per-em space (mid space)
  0x2006, // six-per-em space
  0x2007, // figure space
  0x2008, // punctuation space
  0x2009, // thin space
  0x200A, // hair space
  0x200B, // zero width space
  0x202F, // narrow no-break space
  0x205F, // medium mathematical space
  0x3000, // ideographic space
  0xFEFF, // zero width no-break space
];
/**
 * Tests whether a code point is U+17D2, the single value this module
 * classifies as the Khmer coeng sign.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17D2.
 */
export function isSignCoeng(codepoint) {
  return codepoint === 0x17D2;
}
/**
 * Tests whether a code point lies in the numeral range U+17E0–U+17E9.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isNumeral(codepoint) {
  return codepoint >= 0x17e0 && codepoint <= 0x17e9;
}
/**
 * Tests whether a code point lies in the consonant range U+1780–U+17A2.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isConsonant(codepoint) {
  return codepoint >= 0x1780 && codepoint <= 0x17a2;
}
/**
 * Tests whether a code point lies in the dependent-vowel range U+17B6–U+17C5.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isDependentVowel(codepoint) {
  return codepoint >= 0x17b6 && codepoint <= 0x17c5;
}
/**
 * Tests whether a code point lies in the independent-vowel range U+17A5–U+17B3.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isIndependentVowel(codepoint) {
  return codepoint >= 0x17a5 && codepoint <= 0x17b3;
}
/**
 * Tests whether a code point lies in the diacritic range U+17C6–U+17D1.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isDiacritic(codepoint) {
  return codepoint >= 0x17c6 && codepoint <= 0x17d1;
}
/**
 * Tests whether a code point lies in the lunar-symbol range U+19E0–U+19FF.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isLunarSymbol(codepoint) {
  return codepoint >= 0x19e0 && codepoint <= 0x19ff;
}
/**
 * Tests whether a code point lies in the punctuation range U+17D4–U+17DA.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` is within the range, bounds included.
 */
export function isPunctuationMark(codepoint) {
  return codepoint >= 0x17d4 && codepoint <= 0x17da;
}
/**
 * Tests whether a code point is U+17DB, the single value this module
 * classifies as the riel currency symbol.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17DB.
 */
export function isRielSymbol(codepoint) {
  return codepoint === 0x17db;
}
/**
 * Tests whether a code point is one of the whitespace values listed in
 * `SPACE_CODEPOINTS`.
 * @param {number} codepoint - Unicode code point to test.
 * @returns {boolean} `true` when `codepoint` appears in `SPACE_CODEPOINTS`.
 */
export function isSpace(codepoint) {
  return SPACE_CODEPOINTS.includes(codepoint);
}
/**
 * Maps each token type name to the predicate that recognises it.
 * Every predicate takes a numeric code point and returns a boolean.
 * `tokenize` walks this table in declaration order to classify a character.
 */
export const TokenTypeMatcher = {
  Consonant: isConsonant,
  DependentVowel: isDependentVowel,
  IndependentVowel: isIndependentVowel,
  Diacritic: isDiacritic,
  LunarSymbol: isLunarSymbol,
  Numeral: isNumeral,
  PunctuationMark: isPunctuationMark,
  Space: isSpace,
  SignCoeng: isSignCoeng,
  RielSymbol: isRielSymbol,
};
/**
 * Tokenize Khmer characters.
 *
 * Walks `src` one UTF-16 code unit at a time and lazily yields a token for
 * every index whose code point is accepted by a matcher in `TokenTypeMatcher`.
 * Indices that no matcher accepts (for example Latin letters) are skipped
 * without yielding anything.
 *
 * @param {string} src - Text to scan.
 * @yields {{ type: string, value: string, pos: number }} `type` is the key of
 *   the matcher that accepted the character, `value` is the single UTF-16 unit
 *   at `pos`, and `pos` is that unit's index in `src`.
 */
export function* tokenize(src) {
  for (let pos = 0; pos < src.length; pos++) {
    const cp = src.codePointAt(pos);
    let matchedType = null;
    for (const type in TokenTypeMatcher) {
      if (TokenTypeMatcher[type](cp)) {
        matchedType = type;
        // The matchers declared in TokenTypeMatcher cover disjoint code points,
        // so the first hit is the only hit; no need to try the remaining ones.
        break;
      }
    }
    if (matchedType === null) continue;
    yield { type: matchedType, value: src[pos], pos };
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment