inexorabletash/@ Intl.Segmenter polyfill.md

## @ Intl.Segmenter polyfill.md

      
    Raw
  

              @ Intl.Segmenter polyfill.md
            
          
    Following proposal by @littledan
THIS NO LONGER MATCHES THE PROPOSED API AND SHOULD NOT BE USED

Proposal
Slides

Just proof-of-concept. Do not use in production.
Caveats:


Uses Intl.v8BreakIterator if present (which in turn uses ICU to do the actual work),
otherwise uses a very poorly written, English-only segmenter.
The 'strictness' option for granularity: 'line' is not supported (ignored)

Usage:

// Word breaks (e.g. for creating an index)
const word_segmenter = new Intl.Segmenter('en', {granularity: 'word'});
for (let {segment, breakType} of word_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Possible line breaks (e.g. places to wrap text if necessary)
const line_segmenter = new Intl.Segmenter('en', {granularity: 'line'});
for (let {segment, breakType} of line_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Sentence breaks (on punctuation or newlines)
const sentence_segmenter = new Intl.Segmenter('en', {granularity: 'sentence'});
for (let {segment, breakType} of sentence_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

// Grapheme clusters (basic glyph plus diacritics)
const grapheme_segmenter = new Intl.Segmenter('en', {granularity: 'grapheme'});
for (let {segment, breakType} of grapheme_segmenter.segment(string))
  console.log(`${breakType} => ${segment}`);

  
## example.html
<!DOCTYPE html>
<title>Text Segmentation APIs</title>
<style>
#out span {
  display: inline-block; border: 1px solid red; background-color: red; border-radius: 4px; padding: 2px; margin: 2px;
}

#out span.bt-undefined { background-color: #eeffff; }

#out span.bt-none { background-color: #aaaaaa; }
#out span.bt-letter { background-color: #eeffee; }
#out span.bt-number { background-color: #eeeeff; }
#out span.bt-kana { background-color: #7777ff; }
#out span.bt-ideo { background-color: #77ffff; }

#out span.bt-soft { background-color: #ffffee; }
#out span.bt-hard { background-color: #ffeeee; }

#out span.bt-terminator { background-color: #ffeeff; }
#out span.bt-separator { background-color: #eeffee; }

</style>
<script>//delete Intl.v8BreakIterator</script>
<div id=out></div>
<script src="segment.js"></script>
<script>

const string = `
Your balance is $1,234.56... I think.
Parlez-vous fran\u00E7ais ?
No... Really???
A\u0301

Twas brillig, and the slithy toves
  Did gyre and gimble in the wabe:
All mimsy were the borogoves,
  And the mome raths outgrabe.

"Beware the Jabberwock, my son!
  The jaws that bite, the claws that catch!
Beware the Jubjub bird, and shun
  The frumious Bandersnatch!"

He took his vorpal sword in hand:
  Long time the manxome foe he sought --
So rested he by the Tumtum tree,
  And stood awhile in thought.

And, as in uffish thought he stood,
  The Jabberwock, with eyes of flame,
Came whiffling through the tulgey wood,
  And burbled as it came!

One, two! One, two! And through and through
  The vorpal blade went snicker-snack!
He left it dead, and with its head
  He went galumphing back.

"And, has thou slain the Jabberwock?
  Come to my arms, my beamish boy!
O frabjous day! Callooh! Callay!'
  He chortled in his joy.

Twas brillig, and the slithy toves
  Did gyre and gimble in the wabe:
All mimsy were the borogoves,
  And the mome raths outgrabe.
`;

// Creates an element and appends it to the #out container.
//  * granularity: passed to document.createElement, i.e. used as the tag name
//  * string: when truthy, added to the element as a text node
//  * props: when truthy, each own enumerable key is copied onto the element
function append(granularity, string, props) {
  const node = document.createElement(granularity);
  if (string) {
    const text = document.createTextNode(string);
    node.appendChild(text);
  }
  if (props) {
    for (const name of Object.keys(props)) {
      node[name] = props[name];
    }
  }
  const container = document.querySelector('#out');
  container.appendChild(node);
}

// Render one section per granularity: a heading, one <span> per segment
// (classed and titled by its break type), then a horizontal rule.
for (const granularity of ['grapheme', 'word', 'sentence', 'line']) {
  append('h2', granularity);
  const segmenter = new Intl.Segmenter('en', {granularity: granularity});
  const segments = segmenter.segment(string);
  for (const {segment, breakType} of segments) {
    append('span', segment, {className: `bt-${breakType}`, title: breakType});
  }
  append('hr');
}
</script>

## segment.js
// Rough polyfill for Intl.Segmenter proposal
//
// https://github.com/tc39/proposal-intl-segmenter/blob/master/README.md
//
// Caveats and Limitations
//  * granularity: 'line': 'strictness' option is not supported (ignored)
//  * In Chrome, uses v8BreakIterator
//  * Otherwise, uses very simplistic rules
//    * Ignores locale; only "usable" on English
//    * granularity: 'grapheme' does not understand combining characters
//    * granularity: 'sentence' does not understand decimals

(function(global) {
  // Bail out when a Segmenter implementation is already present so this
  // polyfill never replaces an existing one.
  if ('Intl' in global && 'Segmenter' in global.Intl)
    return;

  // Make sure the Intl namespace object exists before attaching to it.
  global.Intl = global.Intl || {};

  // Granularity values accepted by the Segmenter constructor.
  const GRANULARITIES = ['grapheme', 'word','sentence', 'line'];

  // TODO: Implement http://www.unicode.org/reports/tr29/
  //
  // Fallback segmentation rules used when v8BreakIterator is unavailable.
  // Keyed by granularity; each entry maps a break type name to an anchored
  // regex that is tried against the remaining text. Rules are tried in
  // declaration order, so more specific rules must come first.
  const RULES = {
    grapheme: {
      // With the `u` flag, [\s\S] consumes one whole code point, so astral
      // characters are not split into lone surrogates, and line terminators
      // (\r, \n, U+2028, U+2029) are matched too.
      grapheme: /^[\s\S]/u
    },
    word: {
      // ASCII letters with optional internal apostrophes (e.g. "don't").
      letter: /^[a-z](?:'?[a-z])*/i,
      // Digits with optional , or . group separators (e.g. "1,234.56").
      number: /^\d+([,.]\d+)*/
    },
    sentence: {
      // Text ending in terminal punctuation, plus one optional newline char.
      terminator: /^[^.?!\r\n]+[.?!]+[\r\n]?/,
      // Text without terminal punctuation, plus one optional newline char.
      separator: /^[^.?!\r\n]+[\r\n]?/
    },
    line: {
      // Non-whitespace run directly followed by a newline character.
      hard: /^\S*[\r\n]/,
      // Non-whitespace run plus any trailing whitespace.
      soft: /^\S*\s*/
    }
  };

  // v8BreakIterator reports break types using ICU's UWordBreak names even
  // when the granularity is not "word" (see Runtime_BreakIteratorBreakType
  // in runtime/runtime-i18n.cc). This converts such a name back to the
  // label appropriate for the requested granularity.
  //  * value: break type string as reported by v8BreakIterator
  //  * granularity: v8BreakIterator type ('character', 'word', 'sentence',
  //    'line')
  // Returns undefined for 'character', the remapped label for 'sentence'
  // and 'line' when one is known, and `value` unchanged otherwise.
  function fixBreakType(value, granularity) {
    // Reverse of v8's UWordBreak-to-string mapping; anything unrecognized
    // (or mapping to a falsy status) is treated as rule status 0.
    const wordStatus = {
      none: 0,      // UBRK_WORD_NONE
      number: 100,  // UBRK_WORD_NUMBER
      letter: 200,  // UBRK_WORD_LETTER
      kana: 300,    // UBRK_WORD_KANA
      ideo: 400,    // UBRK_WORD_IDEO
      unknown: -1
    };
    const ruleStatus = wordStatus[value] || 0;

    if (granularity === 'character')
      return undefined;

    let labels;
    if (granularity === 'sentence') {
      // USentenceBreakTag rule status to string.
      labels = {0: 'terminator', 100: 'separator'};
    } else if (granularity === 'line') {
      // ULineBreakTag rule status to string.
      labels = {0: 'soft', 100: 'hard'};
    } else {
      // 'word' and any other granularity: the reported name is already right.
      return value;
    }
    return labels[ruleStatus] || value;
  }

  // Splits `string` into an array of break records.
  //  * locale: passed through to v8BreakIterator; ignored by the fallback
  //  * granularity: 'grapheme', 'word', 'sentence' or 'line'
  //  * string: text to segment (coerced with String())
  // Returns an array of {pos, segment, breakType} where `pos` is the end
  // offset of the segment; the array also carries `initial = 0`, the
  // position before the first segment.
  function segment(locale, granularity, string) {
    const text = String(string);
    const breaks = [];
    if ('v8BreakIterator' in global.Intl) {
      // v8BreakIterator calls grapheme segmentation "character".
      const type = granularity === 'grapheme' ? 'character' : granularity;
      // Use the same global object that was just feature-tested.
      const vbi = new global.Intl.v8BreakIterator(locale, {type: type});
      vbi.adoptText(text);
      let last = 0;
      for (let pos = vbi.next(); pos !== -1; pos = vbi.next()) {
        breaks.push({
          pos: vbi.current(),
          segment: text.slice(last, pos),
          breakType: fixBreakType(vbi.breakType(), type)
        });
        last = pos;
      }
    } else {
      const rules = RULES[granularity];
      const ruleNames = Object.keys(rules);
      let pos = 0;
      while (pos < text.length) {
        // The rule regexes are anchored with ^, so match against the
        // remaining text; slice once per position rather than once per rule.
        const rest = text.slice(pos);
        let matchedText = null;
        let matchedRule;
        for (const rule of ruleNames) {
          const m = rest.match(rules[rule]);
          // Ignore empty matches: they would never advance `pos`.
          if (m && m[0].length > 0) {
            matchedText = m[0];
            matchedRule = rule;
            break;
          }
        }
        if (matchedText !== null) {
          pos += matchedText.length;
          breaks.push({
            pos: pos,
            segment: matchedText,
            breakType: granularity === 'grapheme' ? undefined : matchedRule
          });
        } else {
          // No rule applies: emit a single code unit of type 'none'.
          breaks.push({
            pos: pos + 1,
            segment: text.slice(pos, pos + 1),
            breakType: 'none'
          });
          pos += 1;
        }
      }
    }
    breaks.initial = 0;
    return breaks;
  }

  // Iterator over the break records produced by segment(). It is its own
  // iterable (usable with for...of) and additionally exposes a cursor API:
  // following(), preceding(), position and breakType.
  class $SegmentIterator$ {
    // `string` is accepted but not stored; `breaks` is the array of
    // {pos, segment, breakType} records with an `initial` property.
    constructor(string, breaks) {
      // Index into _breaks; -1 means "before the first break" and
      // _breaks.length means "past the last break".
      this._cur = -1;
      this._type = undefined;
      this._breaks = breaks;
    }

    [Symbol.iterator]() { return this; }

    // Advances to the next break. Returns {done: false, value: {segment,
    // breakType}} or, once past the end, {done: true, value: undefined}.
    next() {
      if (this._cur < this._breaks.length)
        ++this._cur;

      if (this._cur >= this._breaks.length) {
        this._type = undefined;
        return {done: true, value: undefined};
      }

      this._type = this._breaks[this._cur].breakType;
      return {
        done: false,
        value: {
          segment: this._breaks[this._cur].segment,
          breakType: this._breaks[this._cur].breakType
        }
      };
    }

    // Without `index`, moves the cursor forward by one break (unless it is
    // already past the end). With `index`, moves to the first break whose
    // pos is >= index. Returns true when the cursor is on or past the last
    // break, or when there are no breaks at all.
    following(index = undefined) {
      if (!this._breaks.length)
        return true;
      if (index === undefined) {
        if (this._cur < this._breaks.length)
          ++this._cur;
      } else {
        // TODO: binary search
        for (this._cur = 0;
             this._cur < this._breaks.length
             && this._breaks[this._cur].pos < index;
             ++this._cur) {}
      }

      this._type = this._cur < this._breaks.length
        ? this._breaks[this._cur].breakType : undefined;
      return this._cur + 1 >= this._breaks.length;
    }

    // Without `index`, moves the cursor back by one break (first stepping
    // back onto the array if it was past the end). With `index`, moves to
    // the last break whose pos is < index, or to -1 if there is none.
    // breakType becomes the type of the break just after the cursor.
    // Returns true when the cursor is before the first break, or when
    // there are no breaks at all.
    preceding(index = undefined) {
      if (!this._breaks.length)
        return true;
      if (index === undefined) {
        if (this._cur >= this._breaks.length)
          --this._cur;
        if (this._cur >= 0)
          --this._cur;
      } else {
        // TODO: binary search
        for (this._cur = this._breaks.length - 1;
             this._cur >= 0
             && this._breaks[this._cur].pos >= index;
             --this._cur) {}
      }

      this._type =
        this._cur + 1 >= this._breaks.length ? undefined :
        this._breaks[this._cur + 1].breakType;
      return this._cur < 0;
    }

    // Offset of the break under the cursor: `initial` before the first
    // break, the last break's pos when past the end.
    get position() {
      if (this._cur < 0 || !this._breaks.length)
        return this._breaks.initial;
      if (this._cur >= this._breaks.length)
        return this._breaks[this._breaks.length - 1].pos;
      return this._breaks[this._cur].pos;
    }

    // Break type recorded by the most recent next/following/preceding call.
    get breakType() {
      return this._type;
    }
  }

  // Public entry point: Intl.Segmenter(locale, options).
  //  * locale: a tag or an array of tags; falls back to navigator.language
  //    when falsy
  //  * options.granularity: one of GRANULARITIES; anything else (or a
  //    missing value) means 'grapheme'
  global.Intl.Segmenter = class Segmenter {
    constructor(locale, options) {
      if (Array.isArray(locale)) {
        this._locale = locale.map(tag => String(tag));
      } else {
        this._locale = String(locale || navigator.language);
      }

      const requested =
        Object.assign({granularity: 'grapheme'}, options).granularity;
      if (GRANULARITIES.includes(requested)) {
        this._granularity = requested;
      } else {
        this._granularity = 'grapheme';
      }
    }

    // Returns an iterator over the segments of `string`.
    segment(string) {
      const breaks = segment(this._locale, this._granularity, string);
      return new $SegmentIterator$(string, breaks);
    }
  };
}(self));
	<!DOCTYPE html>
	<title>Text Segmentation APIs</title>
	<style>
	#out span {
	display: inline-block; border: 1px solid red; background-color: red; border-radius: 4px; padding: 2px; margin: 2px;
	}

	#out span.bt-undefined { background-color: #eeffff; }

	#out span.bt-none { background-color: #aaaaaa; }
	#out span.bt-letter { background-color: #eeffee; }
	#out span.bt-number { background-color: #eeeeff; }
	#out span.bt-kana { background-color: #7777ff; }
	#out span.bt-ideo { background-color: #77ffff; }

	#out span.bt-soft { background-color: #ffffee; }
	#out span.bt-hard { background-color: #ffeeee; }

	#out span.bt-terminator { background-color: #ffeeff; }
	#out span.bt-separator { background-color: #eeffee; }

	</style>
	<script>//delete Intl.v8BreakIterator</script>
	<div id=out></div>
	<script src="segment.js"></script>
	<script>

	const string = `
	Your balance is $1,234.56... I think.
	Parlez-vous fran\u00E7ais ?
	No... Really???
	A\u0301

	Twas brillig, and the slithy toves
	Did gyre and gimble in the wabe:
	All mimsy were the borogoves,
	And the mome raths outgrabe.

	"Beware the Jabberwock, my son!
	The jaws that bite, the claws that catch!
	Beware the Jubjub bird, and shun
	The frumious Bandersnatch!"

	He took his vorpal sword in hand:
	Long time the manxome foe he sought --
	So rested he by the Tumtum tree,
	And stood awhile in thought.

	And, as in uffish thought he stood,
	The Jabberwock, with eyes of flame,
	Came whiffling through the tulgey wood,
	And burbled as it came!

	One, two! One, two! And through and through
	The vorpal blade went snicker-snack!
	He left it dead, and with its head
	He went galumphing back.

	"And, has thou slain the Jabberwock?
	Come to my arms, my beamish boy!
	O frabjous day! Callooh! Callay!'
	He chortled in his joy.

	Twas brillig, and the slithy toves
	Did gyre and gimble in the wabe:
	All mimsy were the borogoves,
	And the mome raths outgrabe.
	`;

	// Creates an element and appends it to the #out container.
	//  * granularity: passed to document.createElement, i.e. used as the tag name
	//  * string: when truthy, added to the element as a text node
	//  * props: when truthy, each own enumerable key is copied onto the element
	function append(granularity, string, props) {
	  const node = document.createElement(granularity);
	  if (string) {
	    const text = document.createTextNode(string);
	    node.appendChild(text);
	  }
	  if (props) {
	    for (const name of Object.keys(props)) {
	      node[name] = props[name];
	    }
	  }
	  const container = document.querySelector('#out');
	  container.appendChild(node);
	}

	// Render one section per granularity: a heading, one <span> per segment
	// (classed and titled by its break type), then a horizontal rule.
	for (const granularity of ['grapheme', 'word', 'sentence', 'line']) {
	  append('h2', granularity);
	  const segmenter = new Intl.Segmenter('en', {granularity: granularity});
	  const segments = segmenter.segment(string);
	  for (const {segment, breakType} of segments) {
	    append('span', segment, {className: `bt-${breakType}`, title: breakType});
	  }
	  append('hr');
	}
	</script>
	// Rough polyfill for Intl.Segmenter proposal
	//
	// https://github.com/tc39/proposal-intl-segmenter/blob/master/README.md
	//
	// Caveats and Limitations
	// * granularity: 'line': 'strictness' option is not supported (ignored)
	// * In Chrome, uses v8BreakIterator
	// * Otherwise, uses very simplistic rules
	// * Ignores locale; only "usable" on English
	// * granularity: 'grapheme' does not understand combining characters
	// * granularity: 'sentence' does not understand decimals

	(function(global) {
	if ('Intl' in global && 'Segmenter' in global.Intl)
	return;

	// Make sure the Intl namespace object exists before attaching to it.
	global.Intl = global.Intl || {};

	// Granularity values accepted by the Segmenter constructor.
	const GRANULARITIES = ['grapheme', 'word', 'sentence', 'line'];

	// TODO: Implement http://www.unicode.org/reports/tr29/
	//
	// Fallback segmentation rules used when v8BreakIterator is unavailable.
	// Keyed by granularity; each entry maps a break type name to an anchored
	// regex that is tried against the remaining text. Rules are tried in
	// declaration order, so more specific rules must come first.
	const RULES = {
	  grapheme: {
	    // With the `u` flag, [\s\S] consumes one whole code point, so astral
	    // characters are not split into lone surrogates, and line terminators
	    // (\r, \n, U+2028, U+2029) are matched too.
	    grapheme: /^[\s\S]/u
	  },
	  word: {
	    // ASCII letters with optional internal apostrophes (e.g. "don't").
	    letter: /^[a-z](?:'?[a-z])*/i,
	    // Digits with optional , or . group separators (e.g. "1,234.56").
	    number: /^\d+([,.]\d+)*/
	  },
	  sentence: {
	    // Text ending in terminal punctuation, plus one optional newline char.
	    terminator: /^[^.?!\r\n]+[.?!]+[\r\n]?/,
	    // Text without terminal punctuation, plus one optional newline char.
	    separator: /^[^.?!\r\n]+[\r\n]?/
	  },
	  line: {
	    // Non-whitespace run directly followed by a newline character.
	    hard: /^\S*[\r\n]/,
	    // Non-whitespace run plus any trailing whitespace.
	    soft: /^\S*\s*/
	  }
	};

	// v8BreakIterator reports break types using ICU's UWordBreak names even
	// when the granularity is not "word" (see Runtime_BreakIteratorBreakType
	// in runtime/runtime-i18n.cc). This converts such a name back to the
	// label appropriate for the requested granularity.
	//  * value: break type string as reported by v8BreakIterator
	//  * granularity: v8BreakIterator type ('character', 'word', 'sentence',
	//    'line')
	// Returns undefined for 'character', the remapped label for 'sentence'
	// and 'line' when one is known, and `value` unchanged otherwise.
	function fixBreakType(value, granularity) {
	  // Reverse of v8's UWordBreak-to-string mapping; anything unrecognized
	  // (or mapping to a falsy status) is treated as rule status 0.
	  const wordStatus = {
	    none: 0,      // UBRK_WORD_NONE
	    number: 100,  // UBRK_WORD_NUMBER
	    letter: 200,  // UBRK_WORD_LETTER
	    kana: 300,    // UBRK_WORD_KANA
	    ideo: 400,    // UBRK_WORD_IDEO
	    unknown: -1
	  };
	  const ruleStatus = wordStatus[value] || 0;

	  switch (granularity) {
	  case 'character':
	    return undefined;
	  case 'sentence':
	    // USentenceBreakTag rule status to string.
	    return {0: 'terminator', 100: 'separator'}[ruleStatus] || value;
	  case 'line':
	    // ULineBreakTag rule status to string.
	    return {0: 'soft', 100: 'hard'}[ruleStatus] || value;
	  case 'word':
	  default:
	    return value;
	  }
	}

	// Splits `string` into an array of break records.
	//  * locale: passed through to v8BreakIterator; ignored by the fallback
	//  * granularity: 'grapheme', 'word', 'sentence' or 'line'
	//  * string: text to segment (coerced with String())
	// Returns an array of {pos, segment, breakType} where `pos` is the end
	// offset of the segment; the array also carries `initial = 0`, the
	// position before the first segment.
	function segment(locale, granularity, string) {
	  const text = String(string);
	  const breaks = [];
	  if ('v8BreakIterator' in global.Intl) {
	    // v8BreakIterator calls grapheme segmentation "character".
	    const type = granularity === 'grapheme' ? 'character' : granularity;
	    // Use the same global object that was just feature-tested.
	    const vbi = new global.Intl.v8BreakIterator(locale, {type: type});
	    vbi.adoptText(text);
	    let last = 0;
	    for (let pos = vbi.next(); pos !== -1; pos = vbi.next()) {
	      breaks.push({
	        pos: vbi.current(),
	        segment: text.slice(last, pos),
	        breakType: fixBreakType(vbi.breakType(), type)
	      });
	      last = pos;
	    }
	  } else {
	    const rules = RULES[granularity];
	    const ruleNames = Object.keys(rules);
	    let pos = 0;
	    while (pos < text.length) {
	      // The rule regexes are anchored with ^, so match against the
	      // remaining text; slice once per position rather than once per rule.
	      const rest = text.slice(pos);
	      let matchedText = null;
	      let matchedRule;
	      for (const rule of ruleNames) {
	        const m = rest.match(rules[rule]);
	        // Ignore empty matches: they would never advance `pos`.
	        if (m && m[0].length > 0) {
	          matchedText = m[0];
	          matchedRule = rule;
	          break;
	        }
	      }
	      if (matchedText !== null) {
	        pos += matchedText.length;
	        breaks.push({
	          pos: pos,
	          segment: matchedText,
	          breakType: granularity === 'grapheme' ? undefined : matchedRule
	        });
	      } else {
	        // No rule applies: emit a single code unit of type 'none'.
	        breaks.push({
	          pos: pos + 1,
	          segment: text.slice(pos, pos + 1),
	          breakType: 'none'
	        });
	        pos += 1;
	      }
	    }
	  }
	  breaks.initial = 0;
	  return breaks;
	}

	// Iterator over the break records produced by segment(). It is its own
	// iterable (usable with for...of) and additionally exposes a cursor API:
	// following(), preceding(), position and breakType.
	class $SegmentIterator$ {
	  // `string` is accepted but not stored; `breaks` is the array of
	  // {pos, segment, breakType} records with an `initial` property.
	  constructor(string, breaks) {
	    // Index into _breaks; -1 means "before the first break" and
	    // _breaks.length means "past the last break".
	    this._cur = -1;
	    this._type = undefined;
	    this._breaks = breaks;
	  }

	  [Symbol.iterator]() { return this; }

	  // Advances to the next break. Returns {done: false, value: {segment,
	  // breakType}} or, once past the end, {done: true, value: undefined}.
	  next() {
	    if (this._cur < this._breaks.length)
	      ++this._cur;

	    if (this._cur >= this._breaks.length) {
	      this._type = undefined;
	      return {done: true, value: undefined};
	    }

	    this._type = this._breaks[this._cur].breakType;
	    return {
	      done: false,
	      value: {
	        segment: this._breaks[this._cur].segment,
	        breakType: this._breaks[this._cur].breakType
	      }
	    };
	  }

	  // Without `index`, moves the cursor forward by one break (unless it is
	  // already past the end). With `index`, moves to the first break whose
	  // pos is >= index. Returns true when the cursor is on or past the last
	  // break, or when there are no breaks at all.
	  following(index = undefined) {
	    if (!this._breaks.length)
	      return true;
	    if (index === undefined) {
	      if (this._cur < this._breaks.length)
	        ++this._cur;
	    } else {
	      // TODO: binary search
	      for (this._cur = 0;
	           this._cur < this._breaks.length
	           && this._breaks[this._cur].pos < index;
	           ++this._cur) {}
	    }

	    this._type = this._cur < this._breaks.length
	      ? this._breaks[this._cur].breakType : undefined;
	    return this._cur + 1 >= this._breaks.length;
	  }

	  // Without `index`, moves the cursor back by one break (first stepping
	  // back onto the array if it was past the end). With `index`, moves to
	  // the last break whose pos is < index, or to -1 if there is none.
	  // breakType becomes the type of the break just after the cursor.
	  // Returns true when the cursor is before the first break, or when
	  // there are no breaks at all.
	  preceding(index = undefined) {
	    if (!this._breaks.length)
	      return true;
	    if (index === undefined) {
	      if (this._cur >= this._breaks.length)
	        --this._cur;
	      if (this._cur >= 0)
	        --this._cur;
	    } else {
	      // TODO: binary search
	      for (this._cur = this._breaks.length - 1;
	           this._cur >= 0
	           && this._breaks[this._cur].pos >= index;
	           --this._cur) {}
	    }

	    this._type =
	      this._cur + 1 >= this._breaks.length ? undefined :
	      this._breaks[this._cur + 1].breakType;
	    return this._cur < 0;
	  }

	  // Offset of the break under the cursor: `initial` before the first
	  // break, the last break's pos when past the end.
	  get position() {
	    if (this._cur < 0 || !this._breaks.length)
	      return this._breaks.initial;
	    if (this._cur >= this._breaks.length)
	      return this._breaks[this._breaks.length - 1].pos;
	    return this._breaks[this._cur].pos;
	  }

	  // Break type recorded by the most recent next/following/preceding call.
	  get breakType() {
	    return this._type;
	  }
	}

	// Public entry point: Intl.Segmenter(locale, options).
	//  * locale: a tag or an array of tags; falls back to navigator.language
	//    when falsy
	//  * options.granularity: one of GRANULARITIES; anything else (or a
	//    missing value) means 'grapheme'
	global.Intl.Segmenter = class Segmenter {
	  constructor(locale, options) {
	    this._locale = Array.isArray(locale)
	      ? locale.map(s => String(s)) : String(locale || navigator.language);
	    options = Object.assign({granularity: 'grapheme'}, options);
	    this._granularity = GRANULARITIES.includes(options.granularity)
	      ? options.granularity : 'grapheme';
	  }

	  // Returns an iterator over the segments of `string`.
	  segment(string) {
	    return new $SegmentIterator$(
	      string, segment(this._locale, this._granularity, string));
	  }
	};
	}(self));