Last active
October 15, 2020 11:31
-
-
Save loretoparisi/f34a956a29091c3ff62507fda8d7efba to your computer and use it in GitHub Desktop.
Unicode aware Regex Tokenizer in JavaScript with token char offset begin and end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Aggressively tokenize `text` into an array of word tokens.
 *
 * Strategy: pad punctuation with spaces so it separates from words,
 * strip the punctuation that should not survive as tokens, then split
 * on whitespace and drop empty fragments.
 *
 * Fix: the original padding regex used ASCII-only `\w` without the `u`
 * flag, so every non-ASCII letter (e.g. Hangul) was treated as
 * punctuation and spaced out, splitting words like "점점" into single
 * characters. The class now also keeps Unicode letters (\p{L}),
 * combining marks (\p{M}) and digits (\p{N}); ASCII input tokenizes
 * exactly as before.
 *
 * @param {string} text - input text (any script)
 * @returns {string[]} non-empty word tokens
 */
function aggressive_tokenizer(text) {
    // pad most punctuation with spaces; keep word chars, Unicode
    // letters/marks/digits and . - / + < > , & attached for now
    text = text.replace(/[^\w\p{L}\p{M}\p{N}.\-\/+<>,&]/gu, " $& ");
    // detach commas if followed by space
    text = text.replace(/(,\s)/g, " $1");
    // detach single quotes if followed by a space
    text = text.replace(/('\s)/g, " $1");
    // detach a single quote if it is the last char
    text = text.replace(/('$)/, " $1");
    // strip a leading quote character from a quoted word
    text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2");
    // periods before newline or end of string become standalone
    text = text.replace(/\. *(\n|$)/g, " . ");
    // drop remaining punctuation entirely
    text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
    text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
    // finally split remainings into words, discarding empty fragments
    return text.split(/\s+/).filter(a => a && a.trim() !== "");
}
// Demo: tokenize a Korean sentence, then recover each token's
// character offsets (inclusive begin/end) in the original string.
text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
console.dir(tokens, { depth: null, maxArrayLength: null });
// Maps each token string to the end offset of its last located
// occurrence, so repeated tokens advance to successive positions.
var seen = new Map();
var indexes = tokens.map(token => { // for each token
    let item = {
        "word": token
    }
    try {
        var pattern;
        // word boundary
        pattern = '(?<=^|\\PL)$1(?=\\PL|$)';
        // word boundary between a non-letter + non-mark
        // NOTE: this overwrites the previous assignment — only the
        // lookbehind/lookahead form on this line is actually used.
        pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
        // word boundary between a non-letter + non-mark V2
        //pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';
        // escape regex metacharacters in the token before embedding it
        var escaped = token.replace(/[\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
        var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");
        // calculate token begin end
        var match = null;
        while ((match = wordRegex.exec(text)) !== null) {
            // only accept occurrences past the last seen end offset;
            // NOTE(review): `|| -1` treats a stored offset of 0 as
            // "not seen" — confirm a token can never legitimately have
            // a recorded end offset of 0 on a repeat lookup
            if (match.index > (seen.get(token) || -1)) {
                var wordStart = match.index;
                // inclusive end offset (start + length - 1)
                var wordEnd = wordStart + token.length - 1;
                item.characterOffsetBegin = wordStart;
                item.characterOffsetEnd = wordEnd;
                seen.set(token, wordEnd);
                break;
            }
        }
    } catch (error) {
        // e.g. SyntaxError if the runtime rejects the lookbehind pattern
        console.error(token, error);
    }
    // item may lack offsets if no acceptable match was found;
    // NOTE(review): the discussion below documents wrong/missing
    // offsets for mixed Unicode+ASCII input — known limitation
    return item;
});
// Verify: re-slice the original text with each token's offsets and
// check it reproduces the token (offsets are inclusive, hence +1).
indexes.forEach(index => {
    if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
        console.log("MISSING INDEXES " + index.word);
    } else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
        console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
    } else {
        console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
    }
});
Another issue happens when using mixed Unicode + ASCII:
text = "Love in this house 점점 더 깊이 끌려가 there's love"
Using pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';
and RegExp(pattern.replace('$1', escaped), "g");
the outcome will be
[
'Wolves', 'in', 'this',
'house', '점', '점',
'더', '깊', '이',
'끌', '려', '가',
'there', "'", 's',
'love'
]
MISSING INDEXES 점
not only missing char offsets, but even wrong ones
[
{
"word": "Wolves",
"characterOffsetBegin": 0,
"characterOffsetEnd": 6
},
{
"word": "in",
"characterOffsetBegin": 7,
"characterOffsetEnd": 9
},
{
"word": "this",
"characterOffsetBegin": 10,
"characterOffsetEnd": 14
},
{
"word": "house",
"characterOffsetBegin": 15,
"characterOffsetEnd": 20
},
{
"word": "점",
"characterOffsetBegin": 21,
"characterOffsetEnd": 22
},
{
"word": "점"
},
{
"word": "더",
"characterOffsetBegin": 24,
"characterOffsetEnd": 25
},
{
"word": "깊",
"characterOffsetBegin": 26,
"characterOffsetEnd": 27
},
{
"word": "이",
"characterOffsetBegin": 27,
"characterOffsetEnd": 28
},
{
"word": "끌",
"characterOffsetBegin": 29,
"characterOffsetEnd": 30
},
{
"word": "려",
"characterOffsetBegin": 30,
"characterOffsetEnd": 31
},
{
"word": "가",
"characterOffsetBegin": 31,
"characterOffsetEnd": 32
},
{
"word": "there",
"characterOffsetBegin": 33,
"characterOffsetEnd": 38
},
{
"word": "'",
"characterOffsetBegin": 38,
"characterOffsetEnd": 39
},
{
"word": "s",
"characterOffsetBegin": 5,
"characterOffsetEnd": 6
},
{
"word": "love",
"characterOffsetBegin": 41,
"characterOffsetEnd": 45
}
]
so the token `s` in `there's` will have the wrong characterOffsetBegin and characterOffsetEnd:
{
"word": "s",
"characterOffsetBegin": 5,
"characterOffsetEnd": 6
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The best solution so far is using
pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
and new RegExp(pattern.replace('$1', escaped), "g");
Attempts:
Using
pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';
and new RegExp(pattern.replace('$1', escaped), "gu");
Using
pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
and new RegExp(pattern.replace('$1', escaped), "g");
Using
pattern = '(?<=^|\\PL)$1(?=\\PL|$)';
and new RegExp(pattern.replace('$1', escaped), "g");
Using
pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
and new RegExp(pattern.replace('$1', escaped), "g");