korakot/segment.js

## segment.js
/**
 * Lazily split `text` into consecutive word-level segments using the Thai
 * ('th') locale's break rules.
 *
 * Every segment is yielded in order, including whitespace and punctuation,
 * so joining the yielded pieces reproduces the original string.
 *
 * The standard `Intl.Segmenter` API is used when the runtime provides it.
 * The non-standard, V8-only `Intl.v8BreakIterator` is kept purely as a
 * fallback for older engines that lack `Intl.Segmenter`.
 *
 * @param {string} text - The text to segment.
 * @yields {string} The next segment of `text`.
 */
function* gen_words(text){
  if (typeof Intl.Segmenter === 'function') {
    const segmenter = new Intl.Segmenter('th', { granularity: 'word' });
    for (const { segment: piece } of segmenter.segment(text)) {
      yield piece;
    }
    return;
  }

  // Legacy path: walk the break positions reported by V8's break iterator.
  const it = new Intl.v8BreakIterator(['th']);
  it.adoptText(text);
  let start = it.first();
  // next() returns -1 once there are no further boundaries.
  for (let end = it.next(); end !== -1; end = it.next()) {
    yield text.slice(start, end);
    start = end;
  }
}

// Usage:
// segment('สวัสดีครับ สบายดีไหม')
// ['สวัสดี', 'ครับ', ' ', 'สบาย', 'ดี', 'ไหม']
/**
 * Segment `text` eagerly and return all pieces as an array.
 *
 * Drains the `gen_words` generator, preserving the order in which the
 * segments are produced.
 *
 * @param {string} text - The text to segment.
 * @returns {string[]} Every segment yielded by `gen_words(text)`.
 */
function segment(text){
  const pieces = Array.from(gen_words(text));
  return pieces;
}

// Internally, it uses ICU
// https://chromium.googlesource.com/external/v8-i18n/+/refs/heads/master/src/break-iterator.cc
	/**
	 * Lazily split `text` into consecutive word-level segments using the Thai
	 * ('th') locale's break rules.
	 *
	 * Every segment is yielded in order, including whitespace and punctuation,
	 * so joining the yielded pieces reproduces the original string.
	 *
	 * The standard `Intl.Segmenter` API is used when the runtime provides it.
	 * The non-standard, V8-only `Intl.v8BreakIterator` is kept purely as a
	 * fallback for older engines that lack `Intl.Segmenter`.
	 *
	 * @param {string} text - The text to segment.
	 * @yields {string} The next segment of `text`.
	 */
	function* gen_words(text){
	  if (typeof Intl.Segmenter === 'function') {
	    const segmenter = new Intl.Segmenter('th', { granularity: 'word' });
	    for (const { segment: piece } of segmenter.segment(text)) {
	      yield piece;
	    }
	    return;
	  }

	  // Legacy path: walk the break positions reported by V8's break iterator.
	  const it = new Intl.v8BreakIterator(['th']);
	  it.adoptText(text);
	  let start = it.first();
	  // next() returns -1 once there are no further boundaries.
	  for (let end = it.next(); end !== -1; end = it.next()) {
	    yield text.slice(start, end);
	    start = end;
	  }
	}

	// Usage:
	// segment('สวัสดีครับ สบายดีไหม')
	// ['สวัสดี', 'ครับ', ' ', 'สบาย', 'ดี', 'ไหม']
	/**
	 * Segment `text` eagerly and return all pieces as an array.
	 *
	 * Drains the `gen_words` generator, preserving the order in which the
	 * segments are produced.
	 *
	 * @param {string} text - The text to segment.
	 * @returns {string[]} Every segment yielded by `gen_words(text)`.
	 */
	function segment(text){
	  const pieces = Array.from(gen_words(text));
	  return pieces;
	}

	// Internally, it uses ICU
	// https://chromium.googlesource.com/external/v8-i18n/+/refs/heads/master/src/break-iterator.cc