Skip to content

Instantly share code, notes, and snippets.

@inexorabletash
Last active January 5, 2021 17:53
Intl.Segmenter polyfill

Following proposal by @littledan

THIS NO LONGER MATCHES THE PROPOSED API AND SHOULD NOT BE USED

Just proof-of-concept. Do not use in production.

Caveats:

  • Uses Intl.v8BreakIterator if present (which in turn uses ICU to do the actual work), otherwise uses a very poorly written, English-only segmenter.
  • The 'strictness' option for granularity: 'line' is not supported (ignored)

Usage:

// Word breaks (e.g. for creating an index)
const word_segmenter = new Intl.Segmenter('en', {granularity: 'word'});
for (let {segment, breakType} of word_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Possible line breaks (e.g. places to wrap text if necessary)
const line_segmenter = new Intl.Segmenter('en', {granularity: 'line'});
for (let {segment, breakType} of line_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Sentence breaks (on punctuation or newlines)
const sentence_segmenter = new Intl.Segmenter('en', {granularity: 'sentence'});
for (let {segment, breakType} of sentence_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Grapheme clusters (basic glyph plus diacritics)
const grapheme_segmenter = new Intl.Segmenter('en', {granularity: 'grapheme'});
for (let {segment, breakType} of grapheme_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);
<!DOCTYPE html>
<title>Text Segmentation APIs</title>
<style>
#out span {
display: inline-block; border: 1px solid red; background-color: red; border-radius: 4px; padding: 2px; margin: 2px;
}
#out span.bt-undefined { background-color: #eeffff; }
#out span.bt-none { background-color: #aaaaaa; }
#out span.bt-letter { background-color: #eeffee; }
#out span.bt-number { background-color: #eeeeff; }
#out span.bt-kana { background-color: #7777ff; }
#out span.bt-ideo { background-color: #77ffff; }
#out span.bt-soft { background-color: #ffffee; }
#out span.bt-hard { background-color: #ffeeee; }
#out span.bt-terminator { background-color: #ffeeff; }
#out span.bt-separator { background-color: #eeffee; }
</style>
<script>//delete Intl.v8BreakIterator</script>
<div id=out></div>
<script src="segment.js"></script>
<script>
// Sample text fed to every segmenter in the demo below. It mixes a currency
// amount with separators, repeated punctuation, an escaped non-ASCII letter,
// a base letter followed by a combining mark (A + U+0301), and several
// stanzas of verse, so each granularity has something to split.
const string = `
Your balance is $1,234.56... I think.
Parlez-vous fran\u00E7ais ?
No... Really???
A\u0301
Twas brillig, and the slithy toves
Did gyre and gimble in the wabe:
All mimsy were the borogoves,
And the mome raths outgrabe.
"Beware the Jabberwock, my son!
The jaws that bite, the claws that catch!
Beware the Jubjub bird, and shun
The frumious Bandersnatch!"
He took his vorpal sword in hand:
Long time the manxome foe he sought --
So rested he by the Tumtum tree,
And stood awhile in thought.
And, as in uffish thought he stood,
The Jabberwock, with eyes of flame,
Came whiffling through the tulgey wood,
And burbled as it came!
One, two! One, two! And through and through
The vorpal blade went snicker-snack!
He left it dead, and with its head
He went galumphing back.
"And, has thou slain the Jabberwock?
Come to my arms, my beamish boy!
O frabjous day! Callooh! Callay!'
He chortled in his joy.
Twas brillig, and the slithy toves
Did gyre and gimble in the wabe:
All mimsy were the borogoves,
And the mome raths outgrabe.
`;
/**
 * Create an element, optionally give it text and properties, and add it to
 * the end of the `#out` container.
 *
 * @param {string} granularity - Tag name of the element to create (the demo
 *     passes 'h2', 'span' and 'hr').
 * @param {string} [string] - Text placed inside the element; skipped when falsy.
 * @param {Object} [props] - Own enumerable properties copied onto the element
 *     (e.g. `className`, `title`).
 */
function append(granularity, string, props) {
  const element = document.createElement(granularity);

  if (string) {
    const textNode = document.createTextNode(string);
    element.appendChild(textNode);
  }

  if (props) {
    for (const name of Object.keys(props)) {
      element[name] = props[name];
    }
  }

  const container = document.querySelector('#out');
  container.appendChild(element);
}
// Render one section per granularity: a heading, one <span> per segment
// (styled and titled by its break type), then a horizontal rule.
for (const granularity of ['grapheme', 'word', 'sentence', 'line']) {
  append('h2', granularity);

  const segmenter = new Intl.Segmenter('en', {granularity});
  const segments = segmenter.segment(string);
  for (const {segment, breakType} of segments) {
    append('span', segment, {className: `bt-${breakType}`, title: breakType});
  }

  append('hr');
}
</script>
// Rough polyfill for Intl.Segmenter proposal
//
// https://github.com/tc39/proposal-intl-segmenter/blob/master/README.md
//
// Caveats and Limitations
// * granularity: 'line': 'strictness' option is not supported (ignored)
// * In Chrome, uses v8BreakIterator
// * Otherwise, uses very simplistic rules
// * Ignores locale; only "usable" on English
// * granularity: 'grapheme' does not understand combining characters
// * granularity: 'sentence' does not understand decimals
(function(global) {
// Do nothing when the environment already exposes Intl.Segmenter.
if ('Intl' in global && 'Segmenter' in global.Intl)
return;
// Make sure there is an Intl object to hang the Segmenter class on.
global.Intl = global.Intl || {};
// Granularity names the Segmenter constructor accepts; any other value is
// replaced by 'grapheme' there.
const GRANULARITIES = ['grapheme', 'word','sentence', 'line'];
// TODO: Implement http://www.unicode.org/reports/tr29/
// Fallback segmentation rules, keyed by granularity. Each entry maps a break
// type name to a pattern anchored at the start of the remaining text; the
// patterns carry no `g`/`y` flag so they can be reused without resetting
// `lastIndex`.
const RULES = {
  grapheme: {
    // The `u` flag makes the match consume a whole code point, so a surrogate
    // pair (e.g. an emoji) is never split in half. `[\s\S]` also accepts the
    // line terminators (\r, \n, U+2028, U+2029) that `.` refuses.
    // Combining marks are still NOT merged with their base character.
    grapheme: /^[\s\S]/u
  },
  word: {
    // Letters, with single apostrophes allowed between them ("don't").
    letter: /^[a-z](?:'?[a-z])*/i,
    // Digit runs joined by ',' or '.' ("1,234.56").
    number: /^\d+([,.]\d+)*/
  },
  sentence: {
    // Text up to and including a run of terminal punctuation, plus at most
    // one trailing line break.
    terminator: /^[^.?!\r\n]+[.?!]+[\r\n]?/,
    // Text that ends at a line break (or end of input) without punctuation.
    separator: /^[^.?!\r\n]+[\r\n]?/
  },
  line: {
    // A non-space run immediately followed by a line break.
    hard: /^\S*[\r\n]/,
    // A non-space run plus any trailing whitespace.
    soft: /^\S*\s*/
  }
};
// Work around bug in v8BreakIterator where ICU's UWordBreak enum is
// used even if granularity is not "word". See the code in
// Runtime_BreakIteratorBreakType in runtime/runtime-i18n.cc for
// details.
/**
 * Re-label a break type reported by v8BreakIterator for the iterator type
 * that was actually requested.
 *
 * v8BreakIterator always names the rule status using the word-break
 * vocabulary, so the name is first turned back into its numeric status and
 * then, for sentence and line iterators, translated into the matching label.
 *
 * @param {string} value - Break type name as returned by `breakType()`.
 * @param {string} granularity - Iterator type: 'character', 'word',
 *     'sentence' or 'line'.
 * @returns {string|undefined} undefined for 'character'; the corrected label
 *     for 'sentence'/'line' when the status is 0 or 100; otherwise `value`.
 */
function fixBreakType(value, granularity) {
  // Reverse of the UWordBreak -> string mapping. Names that are not listed
  // (and 'none') resolve to status 0.
  const wordStatusByName = {
    none: 0, // UBRK_WORD_NONE
    number: 100, // UBRK_WORD_NUMBER
    letter: 200, // UBRK_WORD_LETTER
    kana: 300, // UBRK_WORD_KANA
    ideo: 400, // UBRK_WORD_IDEO
    unknown: -1
  };
  const status = wordStatusByName[value] || 0;

  if (granularity === 'character')
    return undefined;

  if (granularity === 'sentence') {
    // Sentence rule status (USentenceBreakTag in ICU): 0 and 100.
    const sentenceLabels = {0: 'terminator', 100: 'separator'};
    return sentenceLabels[status] || value;
  }

  if (granularity === 'line') {
    // Line rule status (ULineBreakTag in ICU): 0 and 100.
    const lineLabels = {0: 'soft', 100: 'hard'};
    return lineLabels[status] || value;
  }

  // 'word' is already labelled correctly; unknown granularities pass through.
  return value;
}
/**
 * Split `string` into consecutive segments at the requested granularity.
 *
 * Uses Intl.v8BreakIterator when the host provides it; otherwise falls back
 * to the regular-expression rules in RULES.
 *
 * @param {string|string[]} locale - Passed through to v8BreakIterator; the
 *     fallback path ignores it.
 * @param {string} granularity - 'grapheme', 'word', 'sentence' or 'line'.
 * @param {string} string - Text to segment (coerced with String()).
 * @returns {Array<{pos: number, segment: string, breakType: (string|undefined)}>}
 *     One entry per segment, where `pos` is the index just past the segment.
 *     The array also carries an `initial` property (always 0) holding the
 *     position before the first segment.
 */
function segment(locale, granularity, string) {
  // Coerce once so both code paths see the same text for non-string input.
  const text = String(string);
  const breaks = [];
  if ('v8BreakIterator' in global.Intl) {
    // v8BreakIterator calls the grapheme iterator 'character'.
    const type = granularity === 'grapheme' ? 'character' : granularity;
    // Construct from the same object that was just feature-tested.
    const vbi = new global.Intl.v8BreakIterator(locale, {type: type});
    vbi.adoptText(text);
    let last = 0;
    for (let pos = vbi.next(); pos !== -1; pos = vbi.next()) {
      breaks.push({
        pos: vbi.current(),
        segment: text.slice(last, pos),
        breakType: fixBreakType(vbi.breakType(), type)
      });
      last = pos;
    }
  } else {
    const rules = RULES[granularity];
    const ruleNames = Object.keys(rules);
    // Graphemes never carry a break type, matched or not.
    const isGrapheme = granularity === 'grapheme';
    let pos = 0;
    while (pos < text.length) {
      // Slice once per position instead of once per rule.
      const rest = text.slice(pos);
      let matchedRule;
      let matchedText = '';
      for (const rule of ruleNames) {
        const m = rest.match(rules[rule]);
        // An empty match would never advance `pos`; treat it as no match.
        if (m && m[0].length > 0) {
          matchedRule = rule;
          matchedText = m[0];
          break;
        }
      }
      if (matchedText) {
        pos += matchedText.length;
        breaks.push({
          pos: pos,
          segment: matchedText,
          breakType: isGrapheme ? undefined : matchedRule
        });
      } else {
        // No rule applies here: emit a single code unit so the scan advances.
        const next = pos + 1;
        breaks.push({
          pos: next,
          segment: text.slice(pos, next),
          breakType: isGrapheme ? undefined : 'none'
        });
        pos = next;
      }
    }
  }
  breaks.initial = 0;
  return breaks;
}
/**
 * Cursor over a precomputed list of breaks.
 *
 * `_cur` is an index into `_breaks`: -1 means "before the first break" and
 * `_breaks.length` means "past the last break". The object is its own
 * iterator, so it can be used directly in `for...of`.
 */
class $SegmentIterator$ {
  /**
   * @param {string} string - The segmented text. Not read by this class.
   * @param {Array<{pos: number, segment: string, breakType: (string|undefined)}>} breaks
   *     Break list; must also expose an `initial` property giving the
   *     position reported before the first break.
   */
  constructor(string, breaks) {
    this._cur = -1;
    this._type = undefined;
    this._breaks = breaks;
    // No logging here: the break list can be as long as the input text.
  }

  [Symbol.iterator]() {
    return this;
  }

  /**
   * Advance to the next break.
   * @returns {{done: boolean, value: ({segment: string, breakType: (string|undefined)}|undefined)}}
   */
  next() {
    if (this._cur < this._breaks.length)
      ++this._cur;
    if (this._cur >= this._breaks.length) {
      this._type = undefined;
      return {done: true, value: undefined};
    }
    const entry = this._breaks[this._cur];
    this._type = entry.breakType;
    return {
      done: false,
      value: {
        segment: entry.segment,
        breakType: entry.breakType
      }
    };
  }

  /**
   * Move forward: by one break when `index` is omitted, otherwise to the
   * first break whose position is not less than `index`.
   * @param {number} [index]
   * @returns {boolean} true when the cursor is on or past the last break, or
   *     when there are no breaks at all.
   */
  following(index = undefined) {
    if (!this._breaks.length)
      return true;
    if (index === undefined) {
      if (this._cur < this._breaks.length)
        ++this._cur;
    } else {
      // TODO: binary search
      for (this._cur = 0;
           this._cur < this._breaks.length
             && this._breaks[this._cur].pos < index;
           ++this._cur) {}
    }
    this._type = this._cur < this._breaks.length
      ? this._breaks[this._cur].breakType : undefined;
    return this._cur + 1 >= this._breaks.length;
  }

  /**
   * Move backward: by one break when `index` is omitted, otherwise to the
   * last break whose position is less than `index`.
   * @param {number} [index]
   * @returns {boolean} true when the cursor is before the first break, or
   *     when there are no breaks at all.
   */
  preceding(index = undefined) {
    if (!this._breaks.length)
      return true;
    if (index === undefined) {
      // Step back in from the "past the end" state first.
      if (this._cur >= this._breaks.length)
        --this._cur;
      if (this._cur >= 0)
        --this._cur;
    } else {
      // TODO: binary search
      for (this._cur = this._breaks.length - 1;
           this._cur >= 0
             && this._breaks[this._cur].pos >= index;
           --this._cur) {}
    }
    // When moving backward the reported type is that of the break just
    // after the cursor.
    this._type =
      this._cur + 1 >= this._breaks.length ? undefined :
      this._breaks[this._cur + 1].breakType;
    return this._cur < 0;
  }

  /** Position of the current break, clamped to the first/last known position. */
  get position() {
    if (this._cur < 0 || !this._breaks.length)
      return this._breaks.initial;
    if (this._cur >= this._breaks.length)
      return this._breaks[this._breaks.length - 1].pos;
    return this._breaks[this._cur].pos;
  }

  /** Break type recorded by the most recent next()/following()/preceding(). */
  get breakType() {
    return this._type;
  }
}
// Public entry point: remembers the locale and granularity, and hands out a
// fresh iterator for each string passed to segment().
global.Intl.Segmenter = class Segmenter {
  /**
   * @param {string|string[]} [locale] - Locale tag or list of tags; a falsy
   *     value falls back to navigator.language.
   * @param {{granularity: string}} [options] - `granularity` must be one of
   *     GRANULARITIES; anything else is replaced by 'grapheme'.
   */
  constructor(locale, options) {
    if (Array.isArray(locale)) {
      this._locale = locale.map(tag => String(tag));
    } else {
      this._locale = String(locale || navigator.language);
    }

    const settings = Object.assign({granularity: 'grapheme'}, options);
    const requested = settings.granularity;
    this._granularity = GRANULARITIES.includes(requested)
      ? requested
      : 'grapheme';
  }

  /**
   * @param {string} string - Text to split.
   * @returns {$SegmentIterator$} Iterator over the segments of `string`.
   */
  segment(string) {
    const breaks = segment(this._locale, this._granularity, string);
    return new $SegmentIterator$(string, breaks);
  }
};
}(self));
@inexorabletash
Copy link
Author

I guess the answer would be: what ICU's behavior is...

@arv
Copy link

arv commented Mar 1, 2018

@inexorabletash What is the license for this?

@tomasdev
Copy link

Emoji does not work properly when using grapheme. 😭

@methyl
Copy link

methyl commented Jun 15, 2020

If you need full ICU support outside of Chrome, consider https://github.com/surferseo/intl-segmenter-polyfill / https://www.npmjs.com/package/intl-segmenter-polyfill. It's a WASM module built with ICU to fully polyfill Intl.Segmenter proposal.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment