Skip to content

Instantly share code, notes, and snippets.

@bryanbibat
Created June 1, 2012 05:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bryanbibat/2849130 to your computer and use it in GitHub Desktop.
Save bryanbibat/2849130 to your computer and use it in GitHub Desktop.
Custom implementation of the Porter2 stemming algorithm
class Stemmer
  # Custom implementation of the Porter2 stemming algorithm
  # http://snowball.tartarus.org/algorithms/english/stemmer.html
  #
  # Every method is static; call them as `Stemmer.getStems(text)` or
  # `Stemmer.stem(word)`.
  #
  # Conventions used throughout:
  #   * "Vowels" are the lowercase letters a e i o u y. An uppercase "Y" marks
  #     a y that must be treated as a consonant (see `changeY`); it is turned
  #     back into "y" by `finalStem`.
  #   * `startR1` / `startR2` are the indexes at which the R1 / R2 regions
  #     begin. A suffix is "in R1" when the index of its first character is
  #     >= startR1. Both are computed once per word, before any suffix is
  #     removed, and reused by every step.
  #
  # This implementation does not include the exception word lists or the
  # special R1 prefixes from the reference algorithm.

  # Tokens dropped from the result of `getStems` after stemming.
  EXCLUSION_LIST = ["", ",", "/", "&", "of", "the", "by", "a", "*", "-", "'", "'s", "=", ">", "s'", "\"", "~", "and", "with", "for", "in", "500ml", "to", "at", "or", "n", "x", "pcs"]

  # Accent folding applied by `prepareWord`, in order.
  ACCENT_FOLDS = [
    [/[àáâãäå]/g, "a"]
    [/æ/g, "ae"]
    [/ç/g, "c"]
    [/[èéêë]/g, "e"]
    [/[ìíîï]/g, "i"]
    [/ñ/g, "n"]
    [/[òóôõö]/g, "o"]
    [/œ/g, "oe"]
    [/[ùúûü]/g, "u"]
    [/[ýÿ]/g, "y"]
  ]

  # Step 1b suffixes.
  STEP1B_SUFFIXES = ["eedly", "ingly", "edly", "eed", "ing", "ed"]

  # Step 2: suffix -> replacement (applied when the suffix is in R1).
  # "ogi" and "li" carry an extra precondition handled in `doStep2`.
  STEP2_REPLACEMENTS =
    ational: "ate"
    tional: "tion"
    ization: "ize"
    izer: "ize"
    ation: "ate"
    ator: "ate"
    alism: "al"
    aliti: "al"
    alli: "al"
    enci: "ence"
    anci: "ance"
    abli: "able"
    entli: "ent"
    fulness: "ful"
    ousli: "ous"
    ousness: "ous"
    iveness: "ive"
    iviti: "ive"
    biliti: "ble"
    bli: "ble"
    ogi: "og"
    fulli: "ful"
    lessli: "less"
    li: ""
  STEP2_SUFFIXES = (suffix for own suffix of STEP2_REPLACEMENTS)

  # Step 3: suffix -> replacement (in R1, except "ative" which must be in R2).
  STEP3_REPLACEMENTS =
    ational: "ate"
    tional: "tion"
    alize: "al"
    icate: "ic"
    iciti: "ic"
    ical: "ic"
    ful: ""
    ness: ""
    ative: ""
  STEP3_SUFFIXES = (suffix for own suffix of STEP3_REPLACEMENTS)

  # Step 4: suffixes deleted when in R2 ("ion" additionally needs s/t before it).
  STEP4_SUFFIXES = [
    "ement", "ance", "ence", "able", "ible", "ment"
    "ant", "ent", "ism", "ate", "iti", "ous", "ive", "ize", "ion"
    "al", "er", "ic"
  ]

  # Private helper: returns the longest entry of `suffixes` that `word` ends
  # with, or null when none matches. The order of `suffixes` does not matter.
  longestSuffix = (word, suffixes) ->
    best = null
    for suffix in suffixes
      continue if suffix.length > word.length
      continue unless word.slice(word.length - suffix.length) is suffix
      best = suffix if best is null or suffix.length > best.length
    best

  # Splits `text` on spaces and the characters , ( ) ! : @ /, stems every
  # piece, and returns the stems that are neither in the exclusion list nor
  # plain numbers (digits, optional decimal part, optional trailing ").
  @getStems: (text) ->
    stems = (@stem(word) for word in text.split /[ ,()!:@\/]/)
    (word for word in stems when word not in EXCLUSION_LIST and word.search(/^[\d]+(\.[\d]+)?"?$/) == -1)

  # Returns the stem of a single word.
  @stem: (word) ->
    word = @prepareWord(word)
    return @finalStem(word) if @returnImmediately(word)
    word = @changeY(word)
    # Regions are measured on the word as it is now and kept for all steps.
    startR1 = @getStartR1(word)
    startR2 = @getStartR2(word, startR1)
    word = @removeApostrophe(word)
    word = @doStep1A(word)
    word = @doStep1B(word, startR1)
    word = @doStep1C(word)
    word = @doStep2(word, startR1)
    word = @doStep3(word, startR1, startR2)
    word = @doStep4(word, startR2)
    word = @doStep5(word, startR1, startR2)
    @finalStem(word)

  # Restores every consonant marker "Y" to "y" and removes the first
  # apostrophe found in the word.
  @finalStem: (word) ->
    word = word.replace(/Y/g, "y")
    word.replace(/'/, "")

  # Normalises a raw token: lowercases it, strips one pair of surrounding
  # double or single quotes, strips one trailing "." or "-", maps the acute
  # accent character to an apostrophe and folds accented letters to ASCII.
  @prepareWord: (word) ->
    word = word.toLowerCase()
    word = word.replace /^"(.*)"$/, "$1"
    word = word.replace /^'(.*)'$/, "$1"
    word = word.replace /^(.*)[\.-]$/, "$1"
    # NOTE(review): this pattern matches a literal backslash followed by "s",
    # not whitespace; presumably /\s/g was intended - confirm before changing.
    word = word.replace /\\s/g , ""
    word = word.replace /´/g, "'"
    for [pattern, replacement] in ACCENT_FOLDS
      word = word.replace pattern, replacement
    # NOTE(review): likewise matches a literal backslash followed by "W".
    # Switching to /\W/g would also strip apostrophes and the symbol tokens
    # that the exclusion list filters, so it is left untouched here.
    word = word.replace /\\W/g, ""
    word

  # True when the word must not be stemmed: it has at most two characters or
  # contains a character other than a word character or an apostrophe.
  @returnImmediately: (word) ->
    return true if word.length <= 2
    return true if word.search(/[^\w']/) > -1
    false

  # Index where R1 starts: just after the first vowel followed by a
  # non-vowel, or the word length when there is no such pair.
  @getStartR1: (word) ->
    startR1 = word.search(/[aeiouy][^aeiouy]/)
    if startR1 == -1 then word.length else startR1 + 2

  # Index where R2 starts: the same rule as R1, applied inside R1.
  @getStartR2: (word, startR1) ->
    return startR1 if startR1 == word.length
    r1 = word.slice(startR1)
    startR2 = r1.search(/[aeiouy][^aeiouy]/)
    if startR2 == -1 then word.length else startR1 + startR2 + 2

  # Marks consonant y's as "Y": an initial y, and any y directly after one of
  # a e i o u.
  @changeY: (word) ->
    return word if word.indexOf("y") == -1
    word = "Y" + word.slice(1) if word.charAt(0) == "y"
    word.replace(/([aeiou])y/g, "$1Y")

  # Step 0: removes a trailing "'" or "'s" from a word that otherwise
  # consists of word characters only.
  @removeApostrophe: (word) ->
    match = word.match /^(\w*)('s?)$/
    return word if match == null
    match[1]

  # Step 1a: sses -> ss; ied/ies -> i (or ie when at most one letter
  # precedes); us/ss unchanged; otherwise a final s is removed when a vowel
  # occurs earlier than the letter immediately before it.
  @doStep1A: (word) ->
    if word.match /sses$/
      return word.replace /(\w*)sses$/, "$1ss"
    iedMatch = word.match /(\w*)(ied|ies)$/
    if iedMatch
      replacement = if iedMatch[1].length > 1 then "$1i" else "$1ie"
      return word.replace /(\w*)(ied|ies)$/, replacement
    return word if word.match(/(\w*)(u|s)s$/)
    if word.match(/\w*?[aeiouy]\w+s$/)
      return word.slice(0, word.length - 1)
    word

  # Step 1b: works on the longest of eed, eedly, ed, edly, ing, ingly.
  #   * eed / eedly: replaced by "ee" when in R1, otherwise the word is left
  #     alone (shorter suffixes are deliberately not tried).
  #   * ed / edly / ing / ingly: removed when the remaining part contains a
  #     vowel; then "e" is appended after at/bl/iz, a doubled final consonant
  #     is undoubled, or "e" is appended when the remainder is short.
  @doStep1B: (word, startR1) ->
    suffix = longestSuffix(word, STEP1B_SUFFIXES)
    return word if suffix is null
    stem = word.slice(0, word.length - suffix.length)
    if suffix is "eed" or suffix is "eedly"
      return word if stem.length < startR1
      return stem + "ee"
    return word if stem.search(/[aeiouy]/) is -1
    return stem + "e" if stem.search(/(at|bl|iz)$/) isnt -1
    if stem.search(/(bb|dd|ff|gg|mm|nn|pp|rr|tt)$/) isnt -1
      return stem.slice(0, stem.length - 1)
    return stem + "e" if @isShort(stem, startR1)
    stem

  # Step 1c: a final y or Y becomes i when preceded by a non-vowel that is
  # not the first letter of the word.
  @doStep1C: (word) ->
    word.replace /(\w+[^aeiouy])(y|Y)$/, "$1i"

  # Step 2: finds the longest step-2 suffix ending the word and, when it lies
  # in R1, swaps it for its replacement. "ogi" additionally requires a
  # preceding "l"; "li" requires one of c d e g h k m n r t before it.
  # When the longest suffix does not qualify the word is returned unchanged.
  @doStep2: (word, startR1) ->
    suffix = longestSuffix(word, STEP2_SUFFIXES)
    return word if suffix is null
    stem = word.slice(0, word.length - suffix.length)
    # stem.length is the index at which the suffix starts.
    return word if stem.length < startR1
    return word if suffix is "ogi" and stem.search(/l$/) is -1
    return word if suffix is "li" and stem.search(/[cdeghkmnrt]$/) is -1
    stem + STEP2_REPLACEMENTS[suffix]

  # Step 3: same procedure as step 2 with the step-3 table; "ative" must lie
  # in R2, every other suffix in R1.
  @doStep3: (word, startR1, startR2) ->
    suffix = longestSuffix(word, STEP3_SUFFIXES)
    return word if suffix is null
    stem = word.slice(0, word.length - suffix.length)
    regionStart = if suffix is "ative" then startR2 else startR1
    return word if stem.length < regionStart
    stem + STEP3_REPLACEMENTS[suffix]

  # Step 4: deletes the longest step-4 suffix when it lies in R2. "ion" is
  # only deleted when preceded by s or t, and that letter is kept.
  @doStep4: (word, startR2) ->
    suffix = longestSuffix(word, STEP4_SUFFIXES)
    return word if suffix is null
    stem = word.slice(0, word.length - suffix.length)
    return word if stem.length < startR2
    return word if suffix is "ion" and stem.search(/[st]$/) is -1
    stem

  # Step 5: removes a final "e" when it is in R2, or in R1 and the remainder
  # is not short; removes a final "l" when it is in R2 and preceded by "l".
  @doStep5: (word, startR1, startR2) ->
    lastIndex = word.length - 1
    stem = word.slice(0, lastIndex)
    if word.charAt(lastIndex) is "e"
      return stem if lastIndex >= startR2
      return stem if lastIndex >= startR1 and not @isShort(stem, startR1)
      return word
    return stem if word.slice(-2) is "ll" and lastIndex >= startR2
    word

  # True when `word` ends in a short syllable (a vowel followed by a
  # non-vowel at the very start of a two-letter word, or non-vowel + vowel +
  # non-vowel other than Y/w/x at the end) and R1 starts at or beyond the end
  # of `word`.
  @isShort: (word, startR1) ->
    word.match(/^([aeouiy][^aeouiy]|\w*[^aeiouy][aeouiy][^aeouiyYwx])$/) != null and startR1 >= word.length
# Bundle the public API in a single object and hand it to the module system
# when a `module` object exists; environments without `module` skip the
# assignment.
exports = { Stemmer: Stemmer }
if module? then module.exports = exports
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment