Skip to content

Instantly share code, notes, and snippets.

@seanghay
Last active October 24, 2022 07:17
Show Gist options
  • Save seanghay/c7b209b40ee0677ee6e612d309cbaef4 to your computer and use it in GitHub Desktop.
Save seanghay/c7b209b40ee0677ee6e612d309cbaef4 to your computer and use it in GitHub Desktop.
Khmer Tokenizer
/**
 * Code points that `isSpace` reports as whitespace: ASCII line breaks and tab
 * first, followed by the Unicode space and zero-width separators.
 */
const SPACE_CODEPOINTS = [
  // ASCII control characters
  0x000a, // line feed (\n)
  0x000d, // carriage return (\r)
  0x0009, // horizontal tab (\t)

  // Unicode spaces and separators
  0x0020, // space
  0x00a0, // no-break space
  0x1680, // Ogham space mark
  0x180e, // Mongolian vowel separator
  0x2000, // en quad
  0x2001, // em quad
  0x2002, // en space (nut)
  0x2003, // em space (mutton)
  0x2004, // three-per-em space (thick space)
  0x2005, // four-per-em space (mid space)
  0x2006, // six-per-em space
  0x2007, // figure space
  0x2008, // punctuation space
  0x2009, // thin space
  0x200a, // hair space
  0x200b, // zero width space
  0x202f, // narrow no-break space
  0x205f, // medium mathematical space
  0x3000, // ideographic space
  0xfeff, // zero width no-break space
];
/**
 * Tests whether a code point is U+17D2, the value this module treats as the
 * Khmer sign coeng.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17D2.
 */
export function isSignCoeng(codepoint) {
  const coeng = 0x17d2;
  return codepoint === coeng;
}
/**
 * Tests whether a code point lies in U+17E0–U+17E9, the range this module
 * treats as Khmer numerals.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isNumeral(codepoint) {
  const first = 0x17e0;
  const last = 0x17e9;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+1780–U+17A2, the range this module
 * treats as Khmer consonants.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isConsonant(codepoint) {
  const first = 0x1780;
  const last = 0x17a2;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+17B6–U+17C5, the range this module
 * treats as Khmer dependent vowels.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isDependentVowel(codepoint) {
  const first = 0x17b6;
  const last = 0x17c5;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+17A5–U+17B3, the range this module
 * treats as Khmer independent vowels.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isIndependentVowel(codepoint) {
  const first = 0x17a5;
  const last = 0x17b3;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+17C6–U+17D1, the range this module
 * treats as Khmer diacritics.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isDiacritic(codepoint) {
  const first = 0x17c6;
  const last = 0x17d1;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+19E0–U+19FF, the range this module
 * treats as Khmer lunar date symbols.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isLunarSymbol(codepoint) {
  const first = 0x19e0;
  const last = 0x19ff;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point lies in U+17D4–U+17DA, the range this module
 * treats as Khmer punctuation marks.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isPunctuationMark(codepoint) {
  const first = 0x17d4;
  const last = 0x17da;
  return first <= codepoint && codepoint <= last;
}
/**
 * Tests whether a code point is U+17DB, the value this module treats as the
 * riel currency symbol.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17DB.
 */
export function isRielSymbol(codepoint) {
  const riel = 0x17db;
  return codepoint === riel;
}
/**
 * Tests whether a code point is one of the whitespace values listed in
 * `SPACE_CODEPOINTS`.
 * @param {number} codepoint - Unicode code point to classify.
 * @returns {boolean} `true` when the list contains `codepoint`.
 */
export function isSpace(codepoint) {
  return SPACE_CODEPOINTS.includes(codepoint);
}
/**
 * Lookup table pairing each token type name with the predicate that accepts
 * code points of that type. `tokenize` enumerates these entries to classify
 * every code point it reads, and uses the key as the token's `type`.
 */
export const TokenTypeMatcher = {
  // Letters and marks
  Consonant: isConsonant,
  DependentVowel: isDependentVowel,
  IndependentVowel: isIndependentVowel,
  Diacritic: isDiacritic,

  // Symbols, digits and separators
  LunarSymbol: isLunarSymbol,
  Numeral: isNumeral,
  PunctuationMark: isPunctuationMark,
  Space: isSpace,
  SignCoeng: isSignCoeng,
  RielSymbol: isRielSymbol,
};
/**
 * Lazily tokenizes a string into classified Khmer characters.
 *
 * The input is walked one Unicode code point at a time. Each code point is
 * offered to the predicates in `TokenTypeMatcher`, and the first predicate that
 * accepts it decides the token type. Code points that no predicate accepts are
 * skipped and produce no token.
 *
 * @param {string} src - Text to tokenize.
 * @yields {{type: string, value: string, pos: number}} One token per matched
 *   code point: `type` is the matching key of `TokenTypeMatcher`, `value` is
 *   the complete character, and `pos` is its UTF-16 index within `src`.
 */
export function* tokenize(src) {
  // Own keys only, read once: the table does not need re-enumerating for
  // every character, and inherited enumerable properties must not be called.
  const types = Object.keys(TokenTypeMatcher);

  let pos = 0;
  while (pos < src.length) {
    const start = pos;
    const cp = src.codePointAt(start);

    // codePointAt() decodes a surrogate pair into a single value above U+FFFF.
    // Step over both UTF-16 units so the trailing surrogate is not visited as
    // a character of its own and `value` holds the whole character.
    pos += cp > 0xffff ? 2 : 1;

    // Stop at the first accepting predicate. The predicates defined in this
    // file cover non-overlapping ranges, so at most one of them can match.
    const matchedType = types.find((type) => TokenTypeMatcher[type](cp));
    if (matchedType === undefined) continue;

    yield { type: matchedType, value: src.slice(start, pos), pos: start };
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment