filipsalomonsson/gist:436518

## gistfile1.pyw
#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Stemmers for Swedish and English

Implements the Swedish stemming algorithm used in snowball:
<http://snowball.tartarus.org/algorithms/swedish/stemmer.html>

Implements the English (Porter2) stemming algorithm used in snowball:
<http://snowball.tartarus.org/algorithms/english/stemmer.html>
"""

__author__ = "Filip Salomonsson <filip.salomonsson@gmail.com>"

import codecs
import re
import os

DEBUG = int(os.environ.get("DEBUG", 0))
def _debug(s, prefix=""):
    if DEBUG: print >> sys.stderr, prefix, s

def _show(word, r1=None, r2=None):
    if r1 is None: r1 = len(word)
    if r2 is None: r2 = len(word)
    return "%s[%s[%s]]" % (word[:r1], word[r1:r2], word[r2:])


class SwedishStemmer(object):
    # Compile regular expression objects for s-ending and suffixes
    _s_ending = "bcdfghjklmnoprtvy"
    _s_ending = re.compile(ur"(%s)s$" % u"|".join(_s_ending))

    _suffixes = ['heterna', 'hetens', 'anden', 'heten', 'heter', 'arnas',
                'ernas', 'ornas', 'andes', 'arens', 'andet', 'arna',
                'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades',
                'erns', 'ade', 'are', 'ern', 'ens', 'het', 'ast', 'ad',
                'en', 'ar', 'er', 'or', 'as', 'es', 'at', 'a', 'e']
    _suffixes = re.compile(ur"(%s)$" % u"|".join(_suffixes))

    def _base_and_region(self, word):
        """Split a (unicode) string into base and region. region corresponds
        to R1 in the snowball algorithm description, and base is the part
        preceding it."""

        match = re.search(ur'^(.*?[aeiouyåäö][^aeiouyåäö])(.*)$', word)
        if match:
            # R1 is the region after the first non-vowel following a vowel[...]
            base, region = match.groups()

            #(R1 is adjusted so that the region before it contains at least
            # 3 letters.)"
            if len(base) < 3 and len(region) > 0:
                cut_pos = 3 - len(base)
                base = base + region[:cut_pos]
                region = region[cut_pos:]
            return base, region

        else:
            # [...] or is the null region at the end of the word if there
            # is no such non-vowel.
            return word, ''

    def stem(self, word):
        """Perform stemming a word (as a unicode string) and
        return the stem."""

        base, region = self._base_and_region(word)

        # Step 1:
        # If suffix found in region, delete it...
        region, n = re.subn(self._suffixes, "", region)

        # ..else delete "s" if s-ending in word(!)
        # NB: this searches in entire word, not just in region!
        if n == 0 and re.search(self._s_ending, word) is not None:
            region = region[:-1] # ta bort s:et

        # Step 2:
        # If suffix matches in region, delete the last letter.
        def strip_last_char(match):
            return match.group(0)[:-1]
        region = re.sub(ur"([dg]d|nn|[dgkt]t)$", strip_last_char, region)

        # Step 3:
        # If -fullt och -löst in region, delete trailing t...
        region, n = re.subn(ur"(full|lös)t$", r"\1", region)
        # ...else, if region ends with -lig,-ig or -els, delete that suffix.
        if n == 0:
            region, n = re.subn(ur"(l?ig|els)$", "", region)

        # We're done! Join base and region, and return the result as the stem.
        return base + region


class EnglishStemmer(object):

    _step_1a_invariants = set(["inning", "outing", "canning", "herring",
                               "earring", "proceed", "exceed", "succeed"])

    _step_2_pairs = [('enci', 'ence'), ('anci', 'ance'),
                    ('abli', 'able'),
                    ('entli', 'ent'),
                    ('izer', 'ize'), ('ization', 'ize'),
                    ('ational', 'ate'), ('ation', 'ate'), ('ator', 'ate'),
                    ('tional', 'tion'),
                    ('alism', 'al'), ('aliti', 'al'), ('alli', 'al'),
                    ('fulness', 'ful'),
                    ('ousli', 'ous'), ('ousness', 'ous'),
                    ('iveness', 'ive'), ('iviti', 'ive'),
                    ('biliti', 'ble'), ('bli', 'ble'),
                    ('fulli', 'ful'),
                    ('lessli', 'less'),]

    _step_3_pairs = [("ational", "ate"), ("tional", "tion"), ("alize", "al"),
                    ("icate", "ic"), ("iciti", "ic"), ("ical", "ic"),
                    ("ful", ""), ("ness", "")]

    _step_4_suffixes = "al ance ence er ic able ible ant ement ment " \
                      "ent ism ate iti ous ive ize".split()
    _step_4_suffixes = re.compile(r"(?:%s)$" % "|".join(_step_4_suffixes))

    _li_ending = "cdeghkmnrt"
    _doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")

    _patterns = dict(vowel=r"[aeiouy]", nonvowel=r"[^aeiouy]",
                     nonvowelwxY=r"[^aeiouywxY]",)

    _vc = re.compile(r"%(vowel)s %(nonvowel)s" % _patterns, re.VERBOSE)
    _short_syllable = re.compile(r"""(
                                     %(nonvowel)s %(vowel)s %(nonvowelwxY)s # (a)
                                     |
                                     ^%(vowel)s %(nonvowel)s # (b)
                                    )
                                    $""" % _patterns, re.VERBOSE)

    _short_word = re.compile(r"[^aeiouy]*" + _short_syllable.pattern, re.VERBOSE)


    _exceptions = {"skis": "ski",
                  "skies": "sky",
                  "dying": "die", "lying": "lie", "tying": "tie",
                  "idly": "idl", "gently": "gentl", "singly": "singl",
                  "ugly": "ugli","early": "earli", "only": "onli",
                  "sky": None, "news": None, "howe": None, "atlas": None,
                  "cosmos": None, "bias": None, "andes": None,
                  }

    def stem(self, word):
        """Perform stemming a word (as a unicode string) and
        return the stem. Expects a lowercase word."""

        # If the word has two letters or less, leave it as it is.
        if len(word) <= 2: return word
        if word in self._exceptions: return self._exceptions[word] or word
        # Remove initial apostrophe, if present.
        if word.startswith("'"): word = word[1:]

        # Set initial 'y', or 'y' after a vowel, to 'Y'
        word = re.sub(ur"^y|(?<=[aeiouy])y", u"Y", word)

        # Establish the regions R1 and R2 (as indexes)
        r1, r2 = self._regions(word)
        _debug(_show(word, r1, r2), "*** Stemming:")


        # Step 0: Remove longest matching suffix
        word = re.sub(ur"'(s'?)?$", "", word)
        _debug(_show(word, r1, r2), "After step 0:")


        # Step 1a: Perform action for the longest matching suffix
        if word.endswith("sses"):
            # replace by 'ss'
            word = word[:-2]
        elif word.endswith("ied") or word.endswith("ies"):
            # replace by 'i' if preceded by more than one letter...
            if len(word) > 4:
                word = word[:-2]
            # ...otherwise by 'ie'
            else:
                word = word[:-1]
        elif word.endswith("us") or word.endswith("ss"):
            # do nothing
            pass
        elif word.endswith("s") and re.search(ur"[aeiouy]", word[:-2]):
            # delete if the preceding part contains a vowel
            # not immediately before the 's'
            word = word[:-1]
        _debug(_show(word, r1, r2), "After step 1a:")

        # Following step 1a, leave the following invariant
        if word in self._step_1a_invariants:
            return word


        # Step 1b: Perform action for the longest matching suffix
        match = re.search(r"eed(ly)?$", word)
        if match:
            start = match.start()
            # replace by 'ee' if in R1
            if start >= r1: word = word[:start] + "ee"
        else:
            match = re.search(r"(ed|ing)(ly)?$", word)
            if match:
                # delete if preceding part contains a vowel...
                if re.search(r"[aeiouy]", word[:match.start()]):
                    word = word[:match.start()]
                    # ...and then:
                    if word.endswith("at") or word.endswith("bl") \
                       or word.endswith("iz") or word.endswith("unxxx"):
                        word += u"e"
                    elif word[-2:] in self._doubles:
                        # split doubles
                        word = word[:-1]
                    elif self._short_word.match(word):
                        word += u"e"
        _debug(_show(word, r1, r2), "After step 1b:")

        # Step 1c:
        word = re.sub(r"(?<=.[^aeiouy])[yY]$", "i", word)
        _debug(_show(word, r1, r2), "After step 1c:")


        # Step 2: For the longest matching suffix...
        suff = word[r1:]
        for find, repl in self._step_2_pairs:
            if word.endswith(find):
                # ...perform action if suffix is in R1
                if suff.endswith(find):
                    word = word[:-len(find)] + repl
                break
        else:
            if word[r1-1:].endswith("logi"): word = word[:-1]
            elif suff.endswith("li") and word[-3] in self._li_ending:
                word = word[:-2]
        _debug(_show(word, r1, r2), "2")


        # Step 3: Again, for the longest matching suffix...
        suff = word[r1:]
        for find, repl in self._step_3_pairs:
            if suff.endswith(find):
                word = word[:-len(find)] + repl
                break
        else:
            if word[r2:].endswith("ative"):
                word = word[:-5]
        _debug(_show(word, r1, r2), "After step 3:")


        # Step 4
        match = self._step_4_suffixes.search(word)
        if match:
            start = match.start()
            if start >= r2: word = word[:start]
        else:
            if word[r2:].endswith("ion") and word[-4] in "st":
                word = word[:-3]
        _debug(_show(word, r1, r2), "After step 4:")

        # Step 5:
        if word[r2:].endswith("e") or \
           word[r1:].endswith("e") and not self._short_syllable.search(word[:-1]):
            word = word[:-1]
        elif word[r2-1:].endswith("ll"): word = word[:-1]

        word = word.replace("Y", "y")
        _debug(_show(word, r1, r2), "After step 5:")

        return word

    def _regions(self, word):
        """Calculate and return start positions for R1 and R2."""
        end = len(word)

        # R1 starts after the first VC...
        match = self._vc.search(word)
        if match: r1 = match.end()
        else: r1 = end

        # ...with these exceptions, where R1 starts after the given prefix.
        for exception in ("gener", "commun"):
            if word.startswith(exception):
                r1 = len(exception)

        # R2 starts after the first VC in R1
        match = self._vc.search(word[r1:])
        if match: r2 = r1 + match.end()
        else: r2 = len(word)

        return r1, r2
	#!/usr/bin/env python
	# -- coding: utf8 --
	"""Stemmers for Swedish and English

	Implements the Swedish stemming algorithm used in snowball:
	<http://snowball.tartarus.org/algorithms/swedish/stemmer.html>

	Implements the English (Porter2) stemming algorithm used in snowball:
	<http://snowball.tartarus.org/algorithms/english/stemmer.html>
	"""

	__author__ = "Filip Salomonsson <filip.salomonsson@gmail.com>"

	import codecs
	import re
	import os

	DEBUG = int(os.environ.get("DEBUG", 0))
	def _debug(s, prefix=""):
	if DEBUG: print >> sys.stderr, prefix, s

	def _show(word, r1=None, r2=None):
	if r1 is None: r1 = len(word)
	if r2 is None: r2 = len(word)
	return "%s[%s[%s]]" % (word[:r1], word[r1:r2], word[r2:])


	class SwedishStemmer(object):
	# Compile regular expression objects for s-ending and suffixes
	_s_ending = "bcdfghjklmnoprtvy"
	_s_ending = re.compile(ur"(%s)s$" % u"\|".join(_s_ending))

	_suffixes = ['heterna', 'hetens', 'anden', 'heten', 'heter', 'arnas',
	'ernas', 'ornas', 'andes', 'arens', 'andet', 'arna',
	'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades',
	'erns', 'ade', 'are', 'ern', 'ens', 'het', 'ast', 'ad',
	'en', 'ar', 'er', 'or', 'as', 'es', 'at', 'a', 'e']
	_suffixes = re.compile(ur"(%s)$" % u"\|".join(_suffixes))

	def _base_and_region(self, word):
	"""Split a (unicode) string into base and region. region corresponds
	to R1 in the snowball algorithm description, and base is the part
	preceding it."""

	match = re.search(ur'^(.?[aeiouyåäö][^aeiouyåäö])(.)$', word)
	if match:
	# R1 is the region after the first non-vowel following a vowel[...]
	base, region = match.groups()

	#(R1 is adjusted so that the region before it contains at least
	# 3 letters.)"
	if len(base) < 3 and len(region) > 0:
	cut_pos = 3 - len(base)
	base = base + region[:cut_pos]
	region = region[cut_pos:]
	return base, region

	else:
	# [...] or is the null region at the end of the word if there
	# is no such non-vowel.
	return word, ''

	def stem(self, word):
	"""Perform stemming a word (as a unicode string) and
	return the stem."""

	base, region = self._base_and_region(word)

	# Step 1:
	# If suffix found in region, delete it...
	region, n = re.subn(self._suffixes, "", region)

	# ..else delete "s" if s-ending in word(!)
	# NB: this searches in entire word, not just in region!
	if n == 0 and re.search(self._s_ending, word) is not None:
	region = region[:-1] # ta bort s:et

	# Step 2:
	# If suffix matches in region, delete the last letter.
	def strip_last_char(match):
	return match.group(0)[:-1]
	region = re.sub(ur"([dg]d\|nn\|[dgkt]t)$", strip_last_char, region)

	# Step 3:
	# If -fullt och -löst in region, delete trailing t...
	region, n = re.subn(ur"(full\|lös)t$", r"\1", region)
	# ...else, if region ends with -lig,-ig or -els, delete that suffix.
	if n == 0:
	region, n = re.subn(ur"(l?ig\|els)$", "", region)

	# We're done! Join base and region, and return the result as the stem.
	return base + region



	class EnglishStemmer(object):

	_step_1a_invariants = set(["inning", "outing", "canning", "herring",
	"earring", "proceed", "exceed", "succeed"])

	_step_2_pairs = [('enci', 'ence'), ('anci', 'ance'),
	('abli', 'able'),
	('entli', 'ent'),
	('izer', 'ize'), ('ization', 'ize'),
	('ational', 'ate'), ('ation', 'ate'), ('ator', 'ate'),
	('tional', 'tion'),
	('alism', 'al'), ('aliti', 'al'), ('alli', 'al'),
	('fulness', 'ful'),
	('ousli', 'ous'), ('ousness', 'ous'),
	('iveness', 'ive'), ('iviti', 'ive'),
	('biliti', 'ble'), ('bli', 'ble'),
	('fulli', 'ful'),
	('lessli', 'less'),]

	_step_3_pairs = [("ational", "ate"), ("tional", "tion"), ("alize", "al"),
	("icate", "ic"), ("iciti", "ic"), ("ical", "ic"),
	("ful", ""), ("ness", "")]

	_step_4_suffixes = "al ance ence er ic able ible ant ement ment " \
	"ent ism ate iti ous ive ize".split()
	_step_4_suffixes = re.compile(r"(?:%s)$" % "\|".join(_step_4_suffixes))

	_li_ending = "cdeghkmnrt"
	_doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")

	_patterns = dict(vowel=r"[aeiouy]", nonvowel=r"[^aeiouy]",
	nonvowelwxY=r"[^aeiouywxY]",)

	_vc = re.compile(r"%(vowel)s %(nonvowel)s" % _patterns, re.VERBOSE)
	_short_syllable = re.compile(r"""(
	%(nonvowel)s %(vowel)s %(nonvowelwxY)s # (a)
	\|
	^%(vowel)s %(nonvowel)s # (b)
	)
	$""" % _patterns, re.VERBOSE)

	_short_word = re.compile(r"[^aeiouy]*" + _short_syllable.pattern, re.VERBOSE)



	_exceptions = {"skis": "ski",
	"skies": "sky",
	"dying": "die", "lying": "lie", "tying": "tie",
	"idly": "idl", "gently": "gentl", "singly": "singl",
	"ugly": "ugli","early": "earli", "only": "onli",
	"sky": None, "news": None, "howe": None, "atlas": None,
	"cosmos": None, "bias": None, "andes": None,
	}

	def stem(self, word):
	"""Perform stemming a word (as a unicode string) and
	return the stem. Expects a lowercase word."""

	# If the word has two letters or less, leave it as it is.
	if len(word) <= 2: return word
	if word in self._exceptions: return self._exceptions[word] or word
	# Remove initial apostrophe, if present.
	if word.startswith("'"): word = word[1:]

	# Set initial 'y', or 'y' after a vowel, to 'Y'
	word = re.sub(ur"^y\|(?<=[aeiouy])y", u"Y", word)

	# Establish the regions R1 and R2 (as indexes)
	r1, r2 = self._regions(word)
	_debug(_show(word, r1, r2), "*** Stemming:")


	# Step 0: Remove longest matching suffix
	word = re.sub(ur"'(s'?)?$", "", word)
	_debug(_show(word, r1, r2), "After step 0:")


	# Step 1a: Perform action for the longest matching suffix
	if word.endswith("sses"):
	# replace by 'ss'
	word = word[:-2]
	elif word.endswith("ied") or word.endswith("ies"):
	# replace by 'i' if preceded by more than one letter...
	if len(word) > 4:
	word = word[:-2]
	# ...otherwise by 'ie'
	else:
	word = word[:-1]
	elif word.endswith("us") or word.endswith("ss"):
	# do nothing
	pass
	elif word.endswith("s") and re.search(ur"[aeiouy]", word[:-2]):
	# delete if the preceding part contains a vowel
	# not immediately before the 's'
	word = word[:-1]
	_debug(_show(word, r1, r2), "After step 1a:")

	# Following step 1a, leave the following invariant
	if word in self._step_1a_invariants:
	return word


	# Step 1b: Perform action for the longest matching suffix
	match = re.search(r"eed(ly)?$", word)
	if match:
	start = match.start()
	# replace by 'ee' if in R1
	if start >= r1: word = word[:start] + "ee"
	else:
	match = re.search(r"(ed\|ing)(ly)?$", word)
	if match:
	# delete if preceding part contains a vowel...
	if re.search(r"[aeiouy]", word[:match.start()]):
	word = word[:match.start()]
	# ...and then:
	if word.endswith("at") or word.endswith("bl") \
	or word.endswith("iz") or word.endswith("unxxx"):
	word += u"e"
	elif word[-2:] in self._doubles:
	# split doubles
	word = word[:-1]
	elif self._short_word.match(word):
	word += u"e"
	_debug(_show(word, r1, r2), "After step 1b:")

	# Step 1c:
	word = re.sub(r"(?<=.[^aeiouy])[yY]$", "i", word)
	_debug(_show(word, r1, r2), "After step 1c:")


	# Step 2: For the longest matching suffix...
	suff = word[r1:]
	for find, repl in self._step_2_pairs:
	if word.endswith(find):
	# ...perform action if suffix is in R1
	if suff.endswith(find):
	word = word[:-len(find)] + repl
	break
	else:
	if word[r1-1:].endswith("logi"): word = word[:-1]
	elif suff.endswith("li") and word[-3] in self._li_ending:
	word = word[:-2]
	_debug(_show(word, r1, r2), "2")


	# Step 3: Again, for the longest matching suffix...
	suff = word[r1:]
	for find, repl in self._step_3_pairs:
	if suff.endswith(find):
	word = word[:-len(find)] + repl
	break
	else:
	if word[r2:].endswith("ative"):
	word = word[:-5]
	_debug(_show(word, r1, r2), "After step 3:")


	# Step 4
	match = self._step_4_suffixes.search(word)
	if match:
	start = match.start()
	if start >= r2: word = word[:start]
	else:
	if word[r2:].endswith("ion") and word[-4] in "st":
	word = word[:-3]
	_debug(_show(word, r1, r2), "After step 4:")

	# Step 5:
	if word[r2:].endswith("e") or \
	word[r1:].endswith("e") and not self._short_syllable.search(word[:-1]):
	word = word[:-1]
	elif word[r2-1:].endswith("ll"): word = word[:-1]

	word = word.replace("Y", "y")
	_debug(_show(word, r1, r2), "After step 5:")

	return word

	def _regions(self, word):
	"""Calculate and return start positions for R1 and R2."""
	end = len(word)

	# R1 starts after the first VC...
	match = self._vc.search(word)
	if match: r1 = match.end()
	else: r1 = end

	# ...with these exceptions, where R1 starts after the given prefix.
	for exception in ("gener", "commun"):
	if word.startswith(exception):
	r1 = len(exception)

	# R2 starts after the first VC in R1
	match = self._vc.search(word[r1:])
	if match: r2 = r1 + match.end()
	else: r2 = len(word)

	return r1, r2