Skip to content

Instantly share code, notes, and snippets.

@filipsalomonsson
Created June 13, 2010 09:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save filipsalomonsson/436518 to your computer and use it in GitHub Desktop.
Save filipsalomonsson/436518 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Stemmers for Swedish and English
Implements the Swedish stemming algorithm used in snowball:
<http://snowball.tartarus.org/algorithms/swedish/stemmer.html>
Implements the English (Porter2) stemming algorithm used in snowball:
<http://snowball.tartarus.org/algorithms/english/stemmer.html>
"""
__author__ = "Filip Salomonsson <filip.salomonsson@gmail.com>"
import codecs
import re
import os
DEBUG = int(os.environ.get("DEBUG", 0))
def _debug(s, prefix=""):
if DEBUG: print >> sys.stderr, prefix, s
def _show(word, r1=None, r2=None):
if r1 is None: r1 = len(word)
if r2 is None: r2 = len(word)
return "%s[%s[%s]]" % (word[:r1], word[r1:r2], word[r2:])
class SwedishStemmer(object):
# Compile regular expression objects for s-ending and suffixes
_s_ending = "bcdfghjklmnoprtvy"
_s_ending = re.compile(ur"(%s)s$" % u"|".join(_s_ending))
_suffixes = ['heterna', 'hetens', 'anden', 'heten', 'heter', 'arnas',
'ernas', 'ornas', 'andes', 'arens', 'andet', 'arna',
'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades',
'erns', 'ade', 'are', 'ern', 'ens', 'het', 'ast', 'ad',
'en', 'ar', 'er', 'or', 'as', 'es', 'at', 'a', 'e']
_suffixes = re.compile(ur"(%s)$" % u"|".join(_suffixes))
def _base_and_region(self, word):
"""Split a (unicode) string into base and region. region corresponds
to R1 in the snowball algorithm description, and base is the part
preceding it."""
match = re.search(ur'^(.*?[aeiouyåäö][^aeiouyåäö])(.*)$', word)
if match:
# R1 is the region after the first non-vowel following a vowel[...]
base, region = match.groups()
#(R1 is adjusted so that the region before it contains at least
# 3 letters.)"
if len(base) < 3 and len(region) > 0:
cut_pos = 3 - len(base)
base = base + region[:cut_pos]
region = region[cut_pos:]
return base, region
else:
# [...] or is the null region at the end of the word if there
# is no such non-vowel.
return word, ''
def stem(self, word):
"""Perform stemming a word (as a unicode string) and
return the stem."""
base, region = self._base_and_region(word)
# Step 1:
# If suffix found in region, delete it...
region, n = re.subn(self._suffixes, "", region)
# ..else delete "s" if s-ending in word(!)
# NB: this searches in entire word, not just in region!
if n == 0 and re.search(self._s_ending, word) is not None:
region = region[:-1] # ta bort s:et
# Step 2:
# If suffix matches in region, delete the last letter.
def strip_last_char(match):
return match.group(0)[:-1]
region = re.sub(ur"([dg]d|nn|[dgkt]t)$", strip_last_char, region)
# Step 3:
# If -fullt och -löst in region, delete trailing t...
region, n = re.subn(ur"(full|lös)t$", r"\1", region)
# ...else, if region ends with -lig,-ig or -els, delete that suffix.
if n == 0:
region, n = re.subn(ur"(l?ig|els)$", "", region)
# We're done! Join base and region, and return the result as the stem.
return base + region
class EnglishStemmer(object):
_step_1a_invariants = set(["inning", "outing", "canning", "herring",
"earring", "proceed", "exceed", "succeed"])
_step_2_pairs = [('enci', 'ence'), ('anci', 'ance'),
('abli', 'able'),
('entli', 'ent'),
('izer', 'ize'), ('ization', 'ize'),
('ational', 'ate'), ('ation', 'ate'), ('ator', 'ate'),
('tional', 'tion'),
('alism', 'al'), ('aliti', 'al'), ('alli', 'al'),
('fulness', 'ful'),
('ousli', 'ous'), ('ousness', 'ous'),
('iveness', 'ive'), ('iviti', 'ive'),
('biliti', 'ble'), ('bli', 'ble'),
('fulli', 'ful'),
('lessli', 'less'),]
_step_3_pairs = [("ational", "ate"), ("tional", "tion"), ("alize", "al"),
("icate", "ic"), ("iciti", "ic"), ("ical", "ic"),
("ful", ""), ("ness", "")]
_step_4_suffixes = "al ance ence er ic able ible ant ement ment " \
"ent ism ate iti ous ive ize".split()
_step_4_suffixes = re.compile(r"(?:%s)$" % "|".join(_step_4_suffixes))
_li_ending = "cdeghkmnrt"
_doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
_patterns = dict(vowel=r"[aeiouy]", nonvowel=r"[^aeiouy]",
nonvowelwxY=r"[^aeiouywxY]",)
_vc = re.compile(r"%(vowel)s %(nonvowel)s" % _patterns, re.VERBOSE)
_short_syllable = re.compile(r"""(
%(nonvowel)s %(vowel)s %(nonvowelwxY)s # (a)
|
^%(vowel)s %(nonvowel)s # (b)
)
$""" % _patterns, re.VERBOSE)
_short_word = re.compile(r"[^aeiouy]*" + _short_syllable.pattern, re.VERBOSE)
_exceptions = {"skis": "ski",
"skies": "sky",
"dying": "die", "lying": "lie", "tying": "tie",
"idly": "idl", "gently": "gentl", "singly": "singl",
"ugly": "ugli","early": "earli", "only": "onli",
"sky": None, "news": None, "howe": None, "atlas": None,
"cosmos": None, "bias": None, "andes": None,
}
def stem(self, word):
"""Perform stemming a word (as a unicode string) and
return the stem. Expects a lowercase word."""
# If the word has two letters or less, leave it as it is.
if len(word) <= 2: return word
if word in self._exceptions: return self._exceptions[word] or word
# Remove initial apostrophe, if present.
if word.startswith("'"): word = word[1:]
# Set initial 'y', or 'y' after a vowel, to 'Y'
word = re.sub(ur"^y|(?<=[aeiouy])y", u"Y", word)
# Establish the regions R1 and R2 (as indexes)
r1, r2 = self._regions(word)
_debug(_show(word, r1, r2), "*** Stemming:")
# Step 0: Remove longest matching suffix
word = re.sub(ur"'(s'?)?$", "", word)
_debug(_show(word, r1, r2), "After step 0:")
# Step 1a: Perform action for the longest matching suffix
if word.endswith("sses"):
# replace by 'ss'
word = word[:-2]
elif word.endswith("ied") or word.endswith("ies"):
# replace by 'i' if preceded by more than one letter...
if len(word) > 4:
word = word[:-2]
# ...otherwise by 'ie'
else:
word = word[:-1]
elif word.endswith("us") or word.endswith("ss"):
# do nothing
pass
elif word.endswith("s") and re.search(ur"[aeiouy]", word[:-2]):
# delete if the preceding part contains a vowel
# not immediately before the 's'
word = word[:-1]
_debug(_show(word, r1, r2), "After step 1a:")
# Following step 1a, leave the following invariant
if word in self._step_1a_invariants:
return word
# Step 1b: Perform action for the longest matching suffix
match = re.search(r"eed(ly)?$", word)
if match:
start = match.start()
# replace by 'ee' if in R1
if start >= r1: word = word[:start] + "ee"
else:
match = re.search(r"(ed|ing)(ly)?$", word)
if match:
# delete if preceding part contains a vowel...
if re.search(r"[aeiouy]", word[:match.start()]):
word = word[:match.start()]
# ...and then:
if word.endswith("at") or word.endswith("bl") \
or word.endswith("iz") or word.endswith("unxxx"):
word += u"e"
elif word[-2:] in self._doubles:
# split doubles
word = word[:-1]
elif self._short_word.match(word):
word += u"e"
_debug(_show(word, r1, r2), "After step 1b:")
# Step 1c:
word = re.sub(r"(?<=.[^aeiouy])[yY]$", "i", word)
_debug(_show(word, r1, r2), "After step 1c:")
# Step 2: For the longest matching suffix...
suff = word[r1:]
for find, repl in self._step_2_pairs:
if word.endswith(find):
# ...perform action if suffix is in R1
if suff.endswith(find):
word = word[:-len(find)] + repl
break
else:
if word[r1-1:].endswith("logi"): word = word[:-1]
elif suff.endswith("li") and word[-3] in self._li_ending:
word = word[:-2]
_debug(_show(word, r1, r2), "2")
# Step 3: Again, for the longest matching suffix...
suff = word[r1:]
for find, repl in self._step_3_pairs:
if suff.endswith(find):
word = word[:-len(find)] + repl
break
else:
if word[r2:].endswith("ative"):
word = word[:-5]
_debug(_show(word, r1, r2), "After step 3:")
# Step 4
match = self._step_4_suffixes.search(word)
if match:
start = match.start()
if start >= r2: word = word[:start]
else:
if word[r2:].endswith("ion") and word[-4] in "st":
word = word[:-3]
_debug(_show(word, r1, r2), "After step 4:")
# Step 5:
if word[r2:].endswith("e") or \
word[r1:].endswith("e") and not self._short_syllable.search(word[:-1]):
word = word[:-1]
elif word[r2-1:].endswith("ll"): word = word[:-1]
word = word.replace("Y", "y")
_debug(_show(word, r1, r2), "After step 5:")
return word
def _regions(self, word):
"""Calculate and return start positions for R1 and R2."""
end = len(word)
# R1 starts after the first VC...
match = self._vc.search(word)
if match: r1 = match.end()
else: r1 = end
# ...with these exceptions, where R1 starts after the given prefix.
for exception in ("gener", "commun"):
if word.startswith(exception):
r1 = len(exception)
# R2 starts after the first VC in R1
match = self._vc.search(word[r1:])
if match: r2 = r1 + match.end()
else: r2 = len(word)
return r1, r2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment