-
-
Save GRomR1/84e45fa25fb65e7fbd3a to your computer and use it in GitHub Desktop.
Regular Expression Sentence Tokenizer (English)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits a string of English and/or Russian sentences into word and
 * punctuation tokens.
 *
 * Recognized tokens, tried in this order at each position:
 *   1. an ellipsis (`...`)
 *   2. hyphenated words (`кто-то`, `well-known`)
 *   3. words with an apostrophe (`don't`, `dogs'`)
 *   4. plain words (Cyrillic letters, or `\w` = ASCII letters, digits, `_`)
 *   5. a single ASCII punctuation character
 *
 * Whitespace between tokens is discarded. Text that matches none of the
 * alternatives above (for example `€` or `é`) is kept as its own fragment,
 * in its original position, with surrounding whitespace stripped.
 *
 * @param {String} str the string to tokenize
 * @returns {Array} the extracted tokens, in order of appearance
 */
function tokenize(str) {
  // Ё/ё (U+0401/U+0451) sit outside the contiguous А-я range, so they must be
  // listed explicitly or words such as "ещё" get cut in two.
  var cyrillicWord = '[А-Яа-яЁё]+';

  // JavaScript has no POSIX [:punct:] class, so the 32 ASCII punctuation
  // characters are spelled out. Only \ - [ ] need escaping inside a class.
  var punct = '[!"#$%&\'()*+,\\\\\\-./:;<=>?@\\[\\]^_`{|}~]';

  // Alternation takes the first alternative that matches, so the longer
  // multi-character forms must be listed before plain words and punctuation.
  var alternatives = [
    '\\.{3}',                            // ellipsis
    cyrillicWord + '\\-' + cyrillicWord, // hyphenated Cyrillic words
    '\\w+\\-\\w+',                       // hyphenated ASCII words
    '\\w+\'(?:\\w+)?',                   // contractions / possessives
    cyrillicWord,                        // plain Cyrillic words
    '\\w+',                              // plain ASCII words
    punct                                // single punctuation character
  ];

  // Leading whitespace is consumed outside the capture group so it is dropped;
  // the capture group makes split() return the matched tokens themselves,
  // interleaved with whatever text lay between the matches.
  var re = new RegExp('\\s*(' + alternatives.join('|') + ')');
  var pieces = str.split(re);
  var result = [];

  // Start at index 0: the first piece holds any text that precedes the first
  // match and must not be skipped.
  for (var i = 0; i < pieces.length; i++) {
    // Pieces between matches may be empty or carry stray whitespace (e.g. a
    // trailing newline); neither is a meaningful token.
    var piece = pieces[i].trim();
    if (piece) {
      result.push(piece);
    }
  }
  return result;
} // end tokenize()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment