Skip to content

Instantly share code, notes, and snippets.

@chrisdc
Last active October 8, 2018 12:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisdc/8c608e67432a6cdd2d89aa7131ec133d to your computer and use it in GitHub Desktop.
Save chrisdc/8c608e67432a6cdd2d89aa7131ec133d to your computer and use it in GitHub Desktop.
A compact implementation of the Porter 2 stemming algorithm in Javascript
'use strict';
function handleSuffix(word, patterns) {
var wordLen = word.length;
for (var i = 0, len = patterns.length; i < len; i++) {
if (patterns[i][0].test(word)) {
if (wordLen - patterns[i][3] < patterns[i][2]) {
// The given suffix does not fit in the required region.
return word;
}
return word.replace(patterns[i][0], patterns[i][1]);
}
}
return word;
}
function isShort(word, r1) {
if (!word[r1]) {
return /(^[aeiouy][^aeiouy])|([aeiouy][^aeiouywxY]$)/.test(word);
}
return false;
}
function stemmer(word) {
const special = {
skis: 'ski',
skies: 'sky',
dying: 'die',
lying: 'lie',
tying: 'tie',
idly: 'idl',
gently: 'gentl',
ugly: 'ugli',
early: 'earli',
only: 'onli',
singly: 'singl'
};
const exceptions = [
'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias', 'andes'
];
const exceptions1a = [
'inning', 'outing', 'canning', 'herring', 'earring', 'proceed', 'exceed', 'succeed'
];
const doubles = ['bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'];
var r1, r2;
// 1 or 2 letter words (and some other exceptions) are returned unchanged.
if (word.length <= 2 || exceptions.indexOf(word) !== -1) {
return word;
}
// Some words have predefined stem forms.
if (special[word]) {
return special[word];
}
// Replace y
word = word.replace(/^y|([aeiouy])y/, '$1Y');
//Remove initial '
word = word.replace(/^'/, '');
//var {r1, r2} = findRegions(word);
// Find the regions
const regionRegex1 = /^(gener|commun|arsen)(\w*?[aeiouy][^aeiouy])?(\w*)$/;
const regionRegex2 = /^(\w*?[aeiouy][^aeiouy])(\w*?[aeiouy][^aeiouy])?(\w*)$/;
var match = regionRegex1.exec(word) || regionRegex2.exec(word) || [];
// R1
if (match[1] && match[1].length < word.length) {
r1 = match[1].length;
} else {
r1 = Infinity;
}
// R2
if (match[2] && match[1].length + match[2].length < word.length) {
r2 = match[1].length + match[2].length;
} else {
r2 = Infinity;
}
// Step 0
word = word.replace(/'s'$|'s$|'$/, '');
// Step 1a
word = handleSuffix(word, [
[/sses$/, 'ss'],
[/(\w{2,})(ie[d|s])$/, '$1i'],
[/(\ie[d|s])$/, 'ie'],
[/(us|ss)$/, '$1'],
[/([aeiouy]\w+)(s)$/, '$1']
]);
// Check for exceptions again.
if (exceptions1a.indexOf(word) !== -1) {
return word;
}
// Step 1b
var s1bHandler = function(match, p1) {
var result = p1;
var suffix = result.substring(result.length - 2);
if (['at', 'bl', 'iz'].indexOf(suffix) !== -1) {
result = result + 'e';
} else if (doubles.indexOf(suffix) !== -1) {
result = result.substring(0, result.length - 1);
} else if (isShort(result, r1)) {
result = result + 'e';
}
return result;
}
word = handleSuffix(word, [
[/eedly$/, 'ee', r1, 5],
[/(\w*[aeiouy]\w*)ingly$/, s1bHandler],
[/(\w*[aeiouy]\w*)edly$/, s1bHandler],
[/eed$/, 'ee', r1, 3],
[/(\w*[aeiouy]\w*)ing$/, s1bHandler],
[/(\w*[aeiouy]\w*)ed$/, s1bHandler]
]);
// Step 1c
word = word.replace(/(\w+[^aeiouy])([yY])$/, '$1i');
if (r1 !== Infinity) {
// Step 2
word = handleSuffix(word, [
[/ational$/, 'ate', r1, 7],
[/fulness$/, 'ful', r1, 7],
[/iveness$/, 'ive', r1, 7],
[/ization$/, 'ize', r1, 7],
[/ousness$/, 'ous', r1, 7],
[/biliti$/, 'ble', r1, 6],
[/lessli$/, 'less', r1, 6],
[/tional$/, 'tion', r1, 6],
[/(?:alism|aliti)$/, 'al', r1, 5],
[/ation$/, 'ate', r1, 5],
[/entli$/, 'ent', r1, 5],
[/fulli$/, 'ful', r1, 5],
[/iviti$/, 'ive', r1, 5],
[/ousli$/, 'ous', r1, 5],
[/abli$/, 'able', r1, 4],
[/alli$/, 'al', r1, 4],
[/anci$/, 'ance', r1, 4],
[/ator$/, 'ate', r1, 4],
[/enci$/, 'ence', r1, 4],
[/izer$/, 'ize', r1, 4],
[/bli$/, 'ble', r1, 3],
[/(l)ogi$/, '$1og', r1, 3],
[/([cdeghkmnrt])li$/, '$1', r1, 2]
]);
// Step 3
word = handleSuffix(word, [
[/ational$/, 'ate', r1, 7],
[/tional$/, 'tion', r1, 6],
[/alize$/, 'al', r1, 5],
[/ative$/, '', r2, 5],
[/(?:icate|iciti)$/, 'ic', r1, 5],
[/ical$/, 'ic', r1, 4],
[/ness$/, '', r1, 4],
[/ful$/, '', r1, 3]
]);
if (r2 !== Infinity) {
// Step 4
word = handleSuffix(word, [
[/ement$/, '', r2, 5],
[/(?:able|ance|ence|ible|ment)$/, '', r2, 4],
[/(?:ant|ate|ent|ism|iti|ive|ize|ous)$/, '', r2, 3],
[/([s|t])ion$/, '$1', r2, 3],
[/(?:al|er|ic)$/, '', r2, 2]
]);
}
// Step 5
var s5Handler = function(match, p1, offset, string) {
if (string.length - 1 >= r2) {
return p1 || '';
} else if (!p1 && string.length - 1 >= r1) {
return '';
}
return match;
}
word = handleSuffix(word, [
[/(^[aeiouy][^aeiouy]|[^aeiouy][aeiouy][^aeiouywxY])?e$/, s5Handler],
[/(l)l$/, '$1', r2, 1]
]);
}
// Reset Y
word = word.replace(/Y/, 'y');
return word;
}
module.exports = stemmer;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment