Skip to content

Instantly share code, notes, and snippets.

@GRomR1
Forked from raisch/regex_tokenizer.js
Last active March 18, 2016 12:47
Show Gist options
  • Save GRomR1/84e45fa25fb65e7fbd3a to your computer and use it in GitHub Desktop.
Save GRomR1/84e45fa25fb65e7fbd3a to your computer and use it in GitHub Desktop.
Regular Expression Sentence Tokenizer (English)
// tokenize(str)
// Extracts semantically useful tokens from a string containing English- or
// Russian-language sentences: ellipses, hyphenated words, words with an
// apostrophe, plain words and single punctuation characters.
// Text that matches none of these patterns (e.g. accented Latin letters) is
// kept as its own token, with surrounding whitespace stripped.
// @param {String} str the string to tokenize
// @returns {Array} containing extracted tokens, in order of appearance
function tokenize(str) {
  // Ё/ё (U+0401/U+0451) lie outside the contiguous А-я range, so they must be
  // listed explicitly or words such as "ёлка" get broken apart.
  var cyrillic = '[А-Яа-яЁё]';

  // JavaScript has no POSIX character classes, so this is a hand-written
  // equivalent of [:punct:]; every character is escaped for use inside [...].
  var punct = [
    '\\!', '\\"', '\\#', '\\$', '\\%', '\\&', '\\\'', '\\(', '\\)', '\\*',
    '\\+', '\\,', '\\\\', '\\-', '\\.', '\\/', '\\:', '\\;', '\\<', '\\=',
    '\\>', '\\?', '\\@', '\\[', '\\]', '\\^', '\\_', '\\`', '\\{', '\\|',
    '\\}', '\\~'
  ].join('');

  // Alternatives are tried left to right, so the multi-character forms
  // (ellipsis, hyphenated and apostrophe words) must precede plain words
  // and the single-character punctuation class.
  var re = new RegExp(
    '\\s*' +                                        // discard leading whitespace
    '(' +                                           // start capture group #1
      '\\.{3}' +                                    // ellipsis
      '|' + cyrillic + '+\\-' + cyrillic + '+' +    // hyphenated Cyrillic words
      '|' + '\\w+\\-\\w+' +                         // hyphenated words
      '|' + '\\w+\'(?:\\w+)?' +                     // words with an apostrophe
      '|' + cyrillic + '+' +                        // other Cyrillic words
      '|' + '\\w+' +                                // other words
      '|' + '[' + punct + ']' +                     // single punctuation character
    ')'                                             // end capture group #1
  );

  // Splitting on a regex with a capture group keeps the captured text in the
  // output, interleaved with whatever text lay between the matches.
  var pieces = str.split(re);
  var result = [];

  // Visit every piece, including index 0: unmatched text at the very start
  // of the input must not be lost.
  for (var i = 0, len = pieces.length; i < len; i++) {
    // Captured tokens never contain whitespace, so trimming only affects
    // unmatched fragments and drops whitespace-only ones (e.g. trailing spaces).
    var token = pieces[i] ? pieces[i].trim() : '';
    if (token) {
      result.push(token);
    }
  }
  return result;
} // end tokenize()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment