Created
August 8, 2017 15:03
-
-
Save vyasgiridhar/c219005a4432bd7d72092a7681c399a9 to your computer and use it in GitHub Desktop.
Yours Truly, NLTK Stemmer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm. | |
""" | |
from __future__ import unicode_literals | |
import re | |
from nltk.compat import python_2_unicode_compatible | |
@python_2_unicode_compatible | |
class LancasterStemmer(): | |
""" | |
Lancaster Stemmer | |
>>> st = LancasterStemmer() | |
>>> st.stem('maximum') # Remove "-um" when word is intact | |
'maxim' | |
>>> st.stem('presumably') # Don't remove "-um" when word is not intact | |
'presum' | |
>>> st.stem('multiply') # No action taken if word ends with "-ply" | |
'multiply' | |
>>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules | |
'provid' | |
>>> st.stem('owed') # Word starting with vowel must contain at least 2 letters | |
'ow' | |
>>> st.stem('ear') # ditto | |
'ear' | |
>>> st.stem('saying') # Words starting with consonant must contain at least 3 | |
'say' | |
>>> st.stem('crying') # letters and one of those letters must be a vowel | |
'cry' | |
>>> st.stem('string') # ditto | |
'string' | |
>>> st.stem('meant') # ditto | |
'meant' | |
>>> st.stem('cement') # ditto | |
'cem' | |
>>> st_pre = LancasterStemmer(strip_prefix_flag=True) | |
>>> st_pre.stem('kilometer') # Test Prefix | |
'met' | |
>>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t.")) | |
>>> st_custom.stem("ness") # Change s to t | |
'nest' | |
""" | |
# The rule list is static since it doesn't change between instances | |
default_rule_tuple = ( | |
"ai*2.", # -ia > - if intact | |
"a*1.", # -a > - if intact | |
"bb1.", # -bb > -b | |
"city3s.", # -ytic > -ys | |
"ci2>", # -ic > - | |
"cn1t>", # -nc > -nt | |
"dd1.", # -dd > -d | |
"dei3y>", # -ied > -y | |
"deec2ss.", # -ceed >", -cess | |
"dee1.", # -eed > -ee | |
"de2>", # -ed > - | |
"dooh4>", # -hood > - | |
"e1>", # -e > - | |
"feil1v.", # -lief > -liev | |
"fi2>", # -if > - | |
"gni3>", # -ing > - | |
"gai3y.", # -iag > -y | |
"ga2>", # -ag > - | |
"gg1.", # -gg > -g | |
"ht*2.", # -th > - if intact | |
"hsiug5ct.", # -guish > -ct | |
"hsi3>", # -ish > - | |
"i*1.", # -i > - if intact | |
"i1y>", # -i > -y | |
"ji1d.", # -ij > -id -- see nois4j> & vis3j> | |
"juf1s.", # -fuj > -fus | |
"ju1d.", # -uj > -ud | |
"jo1d.", # -oj > -od | |
"jeh1r.", # -hej > -her | |
"jrev1t.", # -verj > -vert | |
"jsim2t.", # -misj > -mit | |
"jn1d.", # -nj > -nd | |
"j1s.", # -j > -s | |
"lbaifi6.", # -ifiabl > - | |
"lbai4y.", # -iabl > -y | |
"lba3>", # -abl > - | |
"lbi3.", # -ibl > - | |
"lib2l>", # -bil > -bl | |
"lc1.", # -cl > c | |
"lufi4y.", # -iful > -y | |
"luf3>", # -ful > - | |
"lu2.", # -ul > - | |
"lai3>", # -ial > - | |
"lau3>", # -ual > - | |
"la2>", # -al > - | |
"ll1.", # -ll > -l | |
"mui3.", # -ium > - | |
"mu*2.", # -um > - if intact | |
"msi3>", # -ism > - | |
"mm1.", # -mm > -m | |
"nois4j>", # -sion > -j | |
"noix4ct.", # -xion > -ct | |
"noi3>", # -ion > - | |
"nai3>", # -ian > - | |
"na2>", # -an > - | |
"nee0.", # protect -een | |
"ne2>", # -en > - | |
"nn1.", # -nn > -n | |
"pihs4>", # -ship > - | |
"pp1.", # -pp > -p | |
"re2>", # -er > - | |
"rae0.", # protect -ear | |
"ra2.", # -ar > - | |
"ro2>", # -or > - | |
"ru2>", # -ur > - | |
"rr1.", # -rr > -r | |
"rt1>", # -tr > -t | |
"rei3y>", # -ier > -y | |
"sei3y>", # -ies > -y | |
"sis2.", # -sis > -s | |
"si2>", # -is > - | |
"ssen4>", # -ness > - | |
"ss0.", # protect -ss | |
"suo3>", # -ous > - | |
"su*2.", # -us > - if intact | |
"s*1>", # -s > - if intact | |
"s0.", # -s > -s | |
"tacilp4y.", # -plicat > -ply | |
"ta2>", # -at > - | |
"tnem4>", # -ment > - | |
"tne3>", # -ent > - | |
"tna3>", # -ant > - | |
"tpir2b.", # -ript > -rib | |
"tpro2b.", # -orpt > -orb | |
"tcud1.", # -duct > -duc | |
"tpmus2.", # -sumpt > -sum | |
"tpec2iv.", # -cept > -ceiv | |
"tulo2v.", # -olut > -olv | |
"tsis0.", # protect -sist | |
"tsi3>", # -ist > - | |
"tt1.", # -tt > -t | |
"uqi3.", # -iqu > - | |
"ugo1.", # -ogu > -og | |
"vis3j>", # -siv > -j | |
"vie0.", # protect -eiv | |
"vi2>", # -iv > - | |
"ylb1>", # -bly > -bl | |
"yli3y>", # -ily > -y | |
"ylp0.", # protect -ply | |
"yl2>", # -ly > - | |
"ygo1.", # -ogy > -og | |
"yhp1.", # -phy > -ph | |
"ymo1.", # -omy > -om | |
"ypo1.", # -opy > -op | |
"yti3>", # -ity > - | |
"yte3>", # -ety > - | |
"ytl2.", # -lty > -l | |
"yrtsi5.", # -istry > - | |
"yra3>", # -ary > - | |
"yro3>", # -ory > - | |
"yfi3.", # -ify > - | |
"ycn2t>", # -ncy > -nt | |
"yca3>", # -acy > - | |
"zi2>", # -iz > - | |
"zy1s." # -yz > -ys | |
) | |
def __init__(self): | |
"""Create an instance of the Lancaster stemmer. | |
""" | |
# Setup an empty rule dictionary - this will be filled in later | |
self.rule_dictionary = {} | |
# Check if a user wants to strip prefix | |
self._strip_prefix = True | |
# Check if a user wants to use his/her own rule tuples. | |
self._rule_tuple = self.default_rule_tuple | |
def parseRules(self, rule_tuple=None): | |
"""Validate the set of rules used in this stemmer. | |
If this function is called as an individual method, without using stem | |
method, rule_tuple argument will be compiled into self.rule_dictionary. | |
If this function is called within stem, self._rule_tuple will be used. | |
""" | |
# If there is no argument for the function, use class' own rule tuple. | |
rule_tuple = self._rule_tuple | |
valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$") | |
# Empty any old rules from the rule set before adding new ones | |
self.rule_dictionary = {} | |
for rule in rule_tuple: | |
if not valid_rule.match(rule): | |
raise ValueError("The rule {0} is invalid".format(rule)) | |
first_letter = rule[0:1] | |
if first_letter in self.rule_dictionary: | |
self.rule_dictionary[first_letter].append(rule) | |
else: | |
self.rule_dictionary[first_letter] = [rule] | |
def stem(self, word): | |
"""Stem a word using the Lancaster stemmer. | |
""" | |
# Lower-case the word, since all the rules are lower-cased | |
word = word.lower() | |
word = self.__stripPrefix(word) | |
# Save a copy of the original word | |
intact_word = word | |
# If rule dictionary is empty, parse rule tuple. | |
if not self.rule_dictionary: | |
self.parseRules() | |
return self.__doStemming(word, intact_word) | |
def __doStemming(self, word, intact_word): | |
"""Perform the actual word stemming | |
""" | |
valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") | |
proceed = True | |
while proceed: | |
# Find the position of the last letter of the word to be stemmed | |
last_letter_position = self.__getLastLetter(word) | |
# Only stem the word if it has a last letter and a rule matching that last letter | |
if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary: | |
proceed = False | |
else: | |
rule_was_applied = False | |
# Go through each rule that matches the word's final letter | |
for rule in self.rule_dictionary[word[last_letter_position]]: | |
rule_match = valid_rule.match(rule) | |
if rule_match: | |
(ending_string, | |
intact_flag, | |
remove_total, | |
append_string, | |
cont_flag) = rule_match.groups() | |
# Convert the number of chars to remove when stemming | |
# from a string to an integer | |
remove_total = int(remove_total) | |
# Proceed if word's ending matches rule's word ending | |
if word.endswith(ending_string[::-1]): | |
if intact_flag: | |
if (word == intact_word and | |
self.__isAcceptable(word, remove_total)): | |
word = self.__applyRule(word, | |
remove_total, | |
append_string) | |
rule_was_applied = True | |
if cont_flag == '.': | |
proceed = False | |
break | |
elif self.__isAcceptable(word, remove_total): | |
word = self.__applyRule(word, | |
remove_total, | |
append_string) | |
rule_was_applied = True | |
if cont_flag == '.': | |
proceed = False | |
break | |
# If no rules apply, the word doesn't need any more stemming | |
if rule_was_applied == False: | |
proceed = False | |
return word | |
def __getLastLetter(self, word): | |
"""Get the zero-based index of the last alphabetic character in this string | |
""" | |
last_letter = -1 | |
for position in range(len(word)): | |
if word[position].isalpha(): | |
last_letter = position | |
else: | |
break | |
return last_letter | |
def __isAcceptable(self, word, remove_total): | |
"""Determine if the word is acceptable for stemming. | |
""" | |
word_is_acceptable = False | |
# If the word starts with a vowel, it must be at least 2 | |
# characters long to be stemmed | |
if word[0] in "aeiouy": | |
if (len(word) - remove_total >= 2): | |
word_is_acceptable = True | |
# If the word starts with a consonant, it must be at least 3 | |
# characters long (including one vowel) to be stemmed | |
elif (len(word) - remove_total >= 3): | |
if word[1] in "aeiouy": | |
word_is_acceptable = True | |
elif word[2] in "aeiouy": | |
word_is_acceptable = True | |
return word_is_acceptable | |
def __applyRule(self, word, remove_total, append_string): | |
"""Apply the stemming rule to the word | |
""" | |
# Remove letters from the end of the word | |
new_word_length = len(word) - remove_total | |
word = word[0:new_word_length] | |
# And add new letters to the end of the truncated word | |
if append_string: | |
word += append_string | |
return word | |
def __stripPrefix(self, word): | |
"""Remove prefix from a word. | |
This function originally taken from Whoosh. | |
""" | |
for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", | |
"nano", "pico", "pseudo"): | |
if word.startswith(prefix): | |
return word[len(prefix):] | |
return word | |
def __repr__(self): | |
return '<LancasterStemmer>' | |
if __name__ == "__main__": | |
st = LancasterStemmer() | |
print (st.stem('crying')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment