seanghay/khmer-lexer.js

## khmer-lexer.js
// Code points that `isSpace` treats as whitespace: ASCII control whitespace,
// the ordinary and no-break spaces, then the remaining spacing and zero-width
// characters.
const SPACE_CODEPOINTS = [
  0x000a, // line feed (\n)
  0x000d, // carriage return (\r)
  0x0009, // horizontal tab (\t)
  0x0020, // space
  0x00a0, // no-break space
  0x1680, // Ogham space mark
  0x180e, // Mongolian vowel separator
  0x2000, // en quad
  0x2001, // em quad
  0x2002, // en space (nut)
  0x2003, // em space (mutton)
  0x2004, // three-per-em space (thick space)
  0x2005, // four-per-em space (mid space)
  0x2006, // six-per-em space
  0x2007, // figure space
  0x2008, // punctuation space
  0x2009, // thin space
  0x200a, // hair space
  0x200b, // zero width space
  0x202f, // narrow no-break space
  0x205f, // medium mathematical space
  0x3000, // ideographic space
  0xfeff, // zero width no-break space
];

/**
 * Tests whether a code point is U+17D2, the Khmer sign coeng.
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17D2.
 */
export function isSignCoeng(codepoint) {
  const signCoeng = 0x17d2;
  return codepoint === signCoeng;
}

/**
 * Tests whether a code point lies in U+17E0..U+17E9 (the Khmer digits).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isNumeral(codepoint) {
  const lowest = 0x17e0;
  const highest = 0x17e9;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+1780..U+17A2 (the Khmer consonants).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isConsonant(codepoint) {
  const lowest = 0x1780;
  const highest = 0x17a2;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+17B6..U+17C5 (Khmer dependent vowel
 * signs).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isDependentVowel(codepoint) {
  const lowest = 0x17b6;
  const highest = 0x17c5;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+17A5..U+17B3 (Khmer independent
 * vowels). U+17A3 and U+17A4 are outside this range and are not matched.
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isIndependentVowel(codepoint) {
  const lowest = 0x17a5;
  const highest = 0x17b3;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+17C6..U+17D1 (Khmer diacritic signs).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isDiacritic(codepoint) {
  const lowest = 0x17c6;
  const highest = 0x17d1;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+19E0..U+19FF (Khmer lunar date
 * symbols).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isLunarSymbol(codepoint) {
  const lowest = 0x19e0;
  const highest = 0x19ff;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point lies in U+17D4..U+17DA (Khmer punctuation
 * signs).
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
 */
export function isPunctuationMark(codepoint) {
  const lowest = 0x17d4;
  const highest = 0x17da;
  return lowest <= codepoint && codepoint <= highest;
}

/**
 * Tests whether a code point is U+17DB, the Khmer riel currency symbol.
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` only when `codepoint` is exactly 0x17DB.
 */
export function isRielSymbol(codepoint) {
  const rielSymbol = 0x17db;
  return codepoint === rielSymbol;
}

/**
 * Tests whether a code point is one of the entries of `SPACE_CODEPOINTS`.
 *
 * @param {number} codepoint - Code point to test.
 * @returns {boolean} `true` when the list contains `codepoint`.
 */
export function isSpace(codepoint) {
  return SPACE_CODEPOINTS.includes(codepoint);
}

/**
 * Maps each token type name to the predicate that recognises it.
 *
 * `tokenize` enumerates this object in key order, calls every predicate with
 * the current code point, and reports the matching key as the token `type`.
 *
 * @type {Object<string, function(number): boolean>}
 */
export const TokenTypeMatcher = {
  Consonant: isConsonant,
  DependentVowel: isDependentVowel,
  IndependentVowel: isIndependentVowel,
  Diacritic: isDiacritic,
  LunarSymbol: isLunarSymbol,
  Numeral: isNumeral,
  PunctuationMark: isPunctuationMark,
  Space: isSpace,
  SignCoeng: isSignCoeng,
  RielSymbol: isRielSymbol,
};

/**
 * Lazily tokenizes a string, yielding one token for every code point that a
 * matcher in `TokenTypeMatcher` accepts.
 *
 * Code points that no matcher accepts are skipped without producing a token.
 * The input is walked by code point rather than by UTF-16 code unit, so a
 * character outside the Basic Multilingual Plane is classified exactly once
 * and its `value` holds the whole surrogate pair.
 *
 * @param {string} src - Text to tokenize.
 * @yields {{ type: string, value: string, pos: number }} The matcher name,
 *   the matched character, and the UTF-16 index in `src` where it starts.
 */
export function* tokenize(src) {
  let pos = 0;

  while (pos < src.length) {
    const currentPos = pos;
    const cp = src.codePointAt(currentPos);
    // Rebuild the character from the code point so `value` always agrees with
    // `cp`; its length (1 or 2 code units) is how far the cursor must advance.
    const str = String.fromCodePoint(cp);
    pos += str.length;

    let matchedType = null;
    // No early exit: if several matchers accept `cp`, the last one in
    // enumeration order wins.
    for (const [type, matcher] of Object.entries(TokenTypeMatcher)) {
      if (matcher(cp)) {
        matchedType = type;
      }
    }

    if (matchedType === null) continue;
    yield { type: matchedType, value: str, pos: currentPos };
  }
}
	// Code points that `isSpace` treats as whitespace: ASCII control whitespace,
	// the ordinary and no-break spaces, then the remaining spacing and
	// zero-width characters.
	const SPACE_CODEPOINTS = [
		0x000a, // line feed (\n)
		0x000d, // carriage return (\r)
		0x0009, // horizontal tab (\t)
		0x0020, // space
		0x00a0, // no-break space
		0x1680, // Ogham space mark
		0x180e, // Mongolian vowel separator
		0x2000, // en quad
		0x2001, // em quad
		0x2002, // en space (nut)
		0x2003, // em space (mutton)
		0x2004, // three-per-em space (thick space)
		0x2005, // four-per-em space (mid space)
		0x2006, // six-per-em space
		0x2007, // figure space
		0x2008, // punctuation space
		0x2009, // thin space
		0x200a, // hair space
		0x200b, // zero width space
		0x202f, // narrow no-break space
		0x205f, // medium mathematical space
		0x3000, // ideographic space
		0xfeff, // zero width no-break space
	];

	/**
	 * Tests whether a code point is U+17D2, the Khmer sign coeng.
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` only when `codepoint` is exactly 0x17D2.
	 */
	export function isSignCoeng(codepoint) {
		const signCoeng = 0x17d2;
		return codepoint === signCoeng;
	}

	/**
	 * Tests whether a code point lies in U+17E0..U+17E9 (the Khmer digits).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isNumeral(codepoint) {
		const lowest = 0x17e0;
		const highest = 0x17e9;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+1780..U+17A2 (the Khmer consonants).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isConsonant(codepoint) {
		const lowest = 0x1780;
		const highest = 0x17a2;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+17B6..U+17C5 (Khmer dependent vowel
	 * signs).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isDependentVowel(codepoint) {
		const lowest = 0x17b6;
		const highest = 0x17c5;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+17A5..U+17B3 (Khmer independent
	 * vowels). U+17A3 and U+17A4 are outside this range and are not matched.
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isIndependentVowel(codepoint) {
		const lowest = 0x17a5;
		const highest = 0x17b3;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+17C6..U+17D1 (Khmer diacritic signs).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isDiacritic(codepoint) {
		const lowest = 0x17c6;
		const highest = 0x17d1;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+19E0..U+19FF (Khmer lunar date
	 * symbols).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isLunarSymbol(codepoint) {
		const lowest = 0x19e0;
		const highest = 0x19ff;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point lies in U+17D4..U+17DA (Khmer punctuation
	 * signs).
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when `codepoint` is inside the inclusive range.
	 */
	export function isPunctuationMark(codepoint) {
		const lowest = 0x17d4;
		const highest = 0x17da;
		return lowest <= codepoint && codepoint <= highest;
	}

	/**
	 * Tests whether a code point is U+17DB, the Khmer riel currency symbol.
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` only when `codepoint` is exactly 0x17DB.
	 */
	export function isRielSymbol(codepoint) {
		const rielSymbol = 0x17db;
		return codepoint === rielSymbol;
	}

	/**
	 * Tests whether a code point is one of the entries of `SPACE_CODEPOINTS`.
	 *
	 * @param {number} codepoint - Code point to test.
	 * @returns {boolean} `true` when the list contains `codepoint`.
	 */
	export function isSpace(codepoint) {
		return SPACE_CODEPOINTS.includes(codepoint);
	}

	/**
	 * Maps each token type name to the predicate that recognises it.
	 *
	 * `tokenize` enumerates this object in key order, calls every predicate with
	 * the current code point, and reports the matching key as the token `type`.
	 *
	 * @type {Object<string, function(number): boolean>}
	 */
	export const TokenTypeMatcher = {
		Consonant: isConsonant,
		DependentVowel: isDependentVowel,
		IndependentVowel: isIndependentVowel,
		Diacritic: isDiacritic,
		LunarSymbol: isLunarSymbol,
		Numeral: isNumeral,
		PunctuationMark: isPunctuationMark,
		Space: isSpace,
		SignCoeng: isSignCoeng,
		RielSymbol: isRielSymbol,
	};

	/**
	 * Lazily tokenizes a string, yielding one token for every code point that a
	 * matcher in `TokenTypeMatcher` accepts.
	 *
	 * Code points that no matcher accepts are skipped without producing a token.
	 * The input is walked by code point rather than by UTF-16 code unit, so a
	 * character outside the Basic Multilingual Plane is classified exactly once
	 * and its `value` holds the whole surrogate pair.
	 *
	 * @param {string} src - Text to tokenize.
	 * @yields {{ type: string, value: string, pos: number }} The matcher name,
	 *   the matched character, and the UTF-16 index in `src` where it starts.
	 */
	export function* tokenize(src) {
		let pos = 0;

		while (pos < src.length) {
			const currentPos = pos;
			const cp = src.codePointAt(currentPos);
			// Rebuild the character from the code point so `value` always agrees
			// with `cp`; its length (1 or 2 code units) is how far the cursor
			// must advance.
			const str = String.fromCodePoint(cp);
			pos += str.length;

			let matchedType = null;
			// No early exit: if several matchers accept `cp`, the last one in
			// enumeration order wins.
			for (const [type, matcher] of Object.entries(TokenTypeMatcher)) {
				if (matcher(cp)) {
					matchedType = type;
				}
			}

			if (matchedType === null) continue;
			yield { type: matchedType, value: str, pos: currentPos };
		}
	}