Hugo-ter-Doest/Natural sentence tokenizer - analysis current regular expression

## Natural sentence tokenizer - analysis of the current regular expression
// Break the string up into sentences based on punctuation and quotation marks.
// The pattern has two alternatives: a sentence wrapped in bracket/quotation characters,
// or one that starts with any non-whitespace character. Each ends at [.?!] and must be
// delimited by whitespace or the start/end of the input.
// NOTE(review): assumes `text` is a string; with the /g flag, match() then yields an array of
// all matches, or null when nothing matches.
var tokens = text.match(/(?<=\s+|^)[\"\'\‘\“\'\"\[\(\{\⟨](.*?[.?!])(\s[.?!])*[\"\'\’\”\'\"\]\)\}\⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g);

/* Breakdown of the regular expression
/
  (?<=\s+|^)    // Lookbehind is whitespace or start-of-input FOLLOWED BY:
  [\"\'\‘\“\'\"\[\(\{\⟨] // An opening bracket or quotation character (required in this alternative)
    (.*?[.?!])  // any sequence of characters followed by punctuation
    (\s[.?!])*  // any combination of whitespace and punctuation
  [\"\'\’\”\'\"\]\)\}\⟩] // A closing bracket or quotation character (required in this alternative)
  (?=\s+|$)   // lookahead is whitespace or end-of-input

    |         // OR

  (?<=\s+|^)  // Lookbehind is whitespace or start of input FOLLOWED BY:
  \S            // One non-whitespace character
  (.*?[.?!])    // Any character sequence followed by punctuation
  (\s[.?!])*    // Any combination of whitespace and punctuation
  (?=\s+|$)     // Lookahead is whitespace or end-of-input
/g

*/

// Improvement 1: simplified version with ellipses.
// A single alternative replaces the two of the current regex: the surrounding
// bracket/quotation characters are optional, the ellipsis character counts as
// sentence-ending punctuation, and a run without any punctuation is matched by [^.?!…]+.
tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)

/*
  (?<=\s+|^)    // Lookbehind is whitespace or start-of-input FOLLOWED BY:
  [\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional)
    (
      (.*?[.?!…])  // any sequence of characters followed by punctuation
      |
      [^.?!…]+     // or one or more characters that are not punctuation
    )
    (\s[.?!…])*  // any combination of whitespace and punctuation
  [\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional)
  (?=\s+|$)   // lookahead is whitespace or end-of-input
*/

// Improvement 2: same shape as Improvement 1, but the fallback branch is a greedy `.+`
// instead of [^.?!…]+, so text with no punctuation to end at is consumed greedily.
// NOTE(review): `tokens` is already declared with `var` earlier in this file; if the snippets
// are run together as one script, this `let` is a redeclaration error — confirm they are
// meant as standalone alternatives.
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g)

/*
  (?<=\s+|^)    // Lookbehind is whitespace or start-of-input FOLLOWED BY:
  [\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional)
    (
      (.*?[.?!…])  // any sequence of characters followed by punctuation
      |
      .+ // if there is no punctuation to end at, be greedy
    )
    (\s[.?!…])*  // any combination of whitespace and punctuation
  [\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional)
  (?=\s+|$)   // lookahead is whitespace or end-of-input
*/
	// Break the string up into sentences based on punctuation and quotation marks.
	// The pattern has two alternatives: a sentence wrapped in bracket/quotation characters,
	// or one that starts with any non-whitespace character. Each ends at [.?!] and must be
	// delimited by whitespace or the start/end of the input.
	// NOTE(review): assumes `text` is a string; with the /g flag, match() then yields an array of
	// all matches, or null when nothing matches.
	var tokens = text.match(/(?<=\s+|^)[\"\'\‘\“\'\"\[\(\{\⟨](.*?[.?!])(\s[.?!])*[\"\'\’\”\'\"\]\)\}\⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g);

	/* Breakdown of the regular expression
	/
	(?<=\s+|^) // Lookbehind is whitespace or start-of-input FOLLOWED BY:
	[\"\'\‘\“\'\"\[\(\{\⟨] // An opening bracket or quotation character (required in this alternative)
	(.*?[.?!]) // any sequence of characters followed by punctuation
	(\s[.?!])* // any combination of whitespace and punctuation
	[\"\'\’\”\'\"\]\)\}\⟩] // A closing bracket or quotation character (required in this alternative)
	(?=\s+|$) // lookahead is whitespace or end-of-input

	| // OR

	(?<=\s+|^) // Lookbehind is whitespace or start of input FOLLOWED BY:
	\S // One non-whitespace character
	(.*?[.?!]) // Any character sequence followed by punctuation
	(\s[.?!])* // Any combination of whitespace and punctuation
	(?=\s+|$) // Lookahead is whitespace or end-of-input
	/g

	*/

	// Improvement 1: simplified version with ellipses.
	// A single alternative replaces the two of the current regex: the surrounding
	// bracket/quotation characters are optional, the ellipsis character counts as
	// sentence-ending punctuation, and a run without any punctuation is matched by [^.?!…]+.
	tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g);

	/*
	(?<=\s+|^) // Lookbehind is whitespace or start-of-input FOLLOWED BY:
	[\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional)
	(
	(.*?[.?!…]) // any sequence of characters followed by punctuation
	|
	[^.?!…]+ // or one or more characters that are not punctuation
	)
	(\s[.?!…])* // any combination of whitespace and punctuation
	[\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional)
	(?=\s+|$) // lookahead is whitespace or end-of-input
	*/

	// Improvement 2: same shape as Improvement 1, but the fallback branch is a greedy `.+`
	// instead of [^.?!…]+, so text with no punctuation to end at is consumed greedily.
	// NOTE(review): `tokens` is already declared with `var` earlier in this file; if the snippets
	// are run together as one script, this `let` is a redeclaration error — confirm they are
	// meant as standalone alternatives.
	let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g);

	/*
	(?<=\s+|^) // Lookbehind is whitespace or start-of-input FOLLOWED BY:
	[\"\'\‘\“\'\"\[\(\{\⟨]? // Any brackets or quotation characters (optional)
	(
	(.*?[.?!…]) // any sequence of characters followed by punctuation
	|
	.+ // if there is no punctuation to end at, be greedy
	)
	(\s[.?!…])* // any combination of whitespace and punctuation
	[\"\'\’\”\'\"\]\)\}\⟩]? // Any brackets or quotation characters (optional)
	(?=\s+|$) // lookahead is whitespace or end-of-input
	*/