loretoparisi/tokenizer_unicode.js

## tokenizer_unicode.js
function aggressive_tokenizer(text) {
    // most punctuation
    text = text.replace(/[^\w\.\-\/\+\<\>,&]/g, " $& ");
    // commas if followed by space
    text = text.replace(/(,\s)/g, " $1");
    // single quotes if followed by a space
    text = text.replace(/('\s)/g, " $1");
    // single quotes if last char
    text = text.replace(/('$)/, " $1");
    text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2");
    // periods before newline or end of string
    text = text.replace(/\. *(\n|$)/g, " . ");
    // replace punct
    text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
    text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
    // finally split remainings into words
    text = text.split(/\s+/);
    text = text.filter(a => a && a != "" && a.trim() != "");
    return text;
  }

  text = "점점 더 깊이 끌려가"

  var tokens = aggressive_tokenizer(text);

  console.dir(tokens, { depth: null, maxArrayLength: null });

  var seen = new Map();
  var indexes = tokens.map(token => { // for each token
    let item = {
      "word": token
    }

    try {
      var pattern;

      // word boundary
      pattern = '(?<=^|\\PL)$1(?=\\PL|$)';

      // word boundary between a non-letter + non-mark
      pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
      // word boundary between a non-letter + non-mark V2
      //pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';

      var escaped = token.replace(/[\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
      var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");
      // calculate token begin end
      var match = null;
      while ((match = wordRegex.exec(text)) !== null) {
        if (match.index > (seen.get(token) || -1)) {
          var wordStart = match.index;
          var wordEnd = wordStart + token.length - 1;
          item.characterOffsetBegin = wordStart;
          item.characterOffsetEnd = wordEnd;
          seen.set(token, wordEnd);
          break;
        }
      }
    } catch (error) {
      console.error(token, error);
    }
    return item;
  });
  indexes.forEach(index => {
    if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
      console.log("MISSING INDEXES " + index.word);
    } else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
      console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
    } else {
      console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
    }
  });
	function aggressive_tokenizer(text) {
	// most punctuation
	text = text.replace(/[^\w\.\-\/\+\<\>,&]/g, " $& ");
	// commas if followed by space
	text = text.replace(/(,\s)/g, " $1");
	// single quotes if followed by a space
	text = text.replace(/('\s)/g, " $1");
	// single quotes if last char
	text = text.replace(/('$)/, " $1");
	text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2");
	// periods before newline or end of string
	text = text.replace(/\. *(\n\|$)/g, " . ");
	// replace punct
	text = text.replace(/[\\?\^%<>=!&\|+\~]/g, "");
	text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
	// finally split remainings into words
	text = text.split(/\s+/);
	text = text.filter(a => a && a != "" && a.trim() != "");
	return text;
	}

	text = "점점 더 깊이 끌려가"

	var tokens = aggressive_tokenizer(text);

	console.dir(tokens, { depth: null, maxArrayLength: null });

	var seen = new Map();
	var indexes = tokens.map(token => { // for each token
	let item = {
	"word": token
	}

	try {
	var pattern;

	// word boundary
	pattern = '(?<=^\|\\PL)$1(?=\\PL\|$)';

	// word boundary between a non-letter + non-mark
	pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
	// word boundary between a non-letter + non-mark V2
	//pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';

	var escaped = token.replace(/[\[\]{}()*+?.,\\\^$\|#\s]/g, "\\$&");
	var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");
	// calculate token begin end
	var match = null;
	while ((match = wordRegex.exec(text)) !== null) {
	if (match.index > (seen.get(token) \|\| -1)) {
	var wordStart = match.index;
	var wordEnd = wordStart + token.length - 1;
	item.characterOffsetBegin = wordStart;
	item.characterOffsetEnd = wordEnd;
	seen.set(token, wordEnd);
	break;
	}
	}
	} catch (error) {
	console.error(token, error);
	}
	return item;
	});
	indexes.forEach(index => {
	if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
	console.log("MISSING INDEXES " + index.word);
	} else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
	console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
	} else {
	console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
	}
	});