// Last active August 10, 2022 07:09
// Thai Word Segmentation in JavaScript using V8 Break Iterator
/**
 * Lazily splits `text` into consecutive word-level segments using the
 * runtime's locale-aware word breaking for Thai (`th`).
 *
 * Segments are yielded in order and cover the whole input, so separators
 * such as spaces are yielded as their own segments and joining all yielded
 * values reproduces `text`.
 *
 * @param {string} text - The text to segment.
 * @yields {string} The next segment of `text`.
 */
function* gen_words(text) {
  // Intl.v8BreakIterator is a non-standard V8 extension that is missing from
  // some runtimes; prefer the standardized Intl.Segmenter when it exists.
  if (typeof Intl.Segmenter === 'function') {
    const segmenter = new Intl.Segmenter(['th'], { granularity: 'word' });
    for (const { segment } of segmenter.segment(text)) {
      yield segment;
    }
    return;
  }

  // Legacy path for runtimes that only expose the V8 break iterator.
  const it = new Intl.v8BreakIterator(['th']);
  // The iterator must be handed the text before boundaries can be walked.
  it.adoptText(text);
  let start = it.first();
  while (true) {
    const end = it.next();
    // -1 signals that there are no more boundaries.
    if (end === -1) break;
    yield text.slice(start, end);
    start = end;
  }
}
// Usage:
// segment('สวัสดีครับ สบายดีไหม')
// ['สวัสดี', 'ครับ', ' ', 'สบาย', 'ดี', 'ไหม']
/**
 * Eagerly segments `text` into an array of segments by draining the
 * `gen_words` generator.
 *
 * @param {string} text - The text to segment.
 * @returns {string[]} Every segment yielded by `gen_words(text)`, in order.
 */
function segment(text) {
  return Array.from(gen_words(text));
}
// Internally, V8 delegates the word-boundary analysis to ICU (International Components for Unicode).
