# YuukanOO/french_stemmer.rb

## french_stemmer.rb
# -*- encoding: utf-8 -*-
#
# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
# Based on the javascript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
#
# Testing:
#   It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
#   and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
#
# At the time of writing, it fails for 242 of the 20403 words; feel free to edit this gist.

# Stems a French word following the Snowball French stemming algorithm
# (http://snowball.tartarus.org/algorithms/french/stemmer.html).
#
# The word is trimmed and downcased before processing, so capitalised input
# and a trailing newline (e.g. a line read from a file) yield the same stem as
# the bare lowercase word.
#
# @param word [String] the word to stem; the argument itself is not modified
# @return [String] the stem, downcased and stripped
def stem(word)
    #    Letters in French include the following accented forms,
    #        â   à   ç   ë   é   ê   è   ï   î   ô   û   ù
    #    The following letters are vowels:
    #        a   e   i   o   u   y   â   à   ë   é   ê   è   ï   î   ô   û   ù

    # Normalised input, compared with the working word later on to find out
    # whether a step altered it. Stripping matters: a trailing "\n" would be
    # matched as a non-vowel by the negated character classes below (step 6
    # would then turn "été\n" into "éte").
    original_word = word.strip.downcase

    # Working copy, so the in-place gsub! calls never touch original_word
    word = original_word.dup

    # Uppercase some part to exclude them later on
    # (u after q, u/i between vowels and y next to a vowel are not vowels)
    word.gsub!(/qu/, 'qU')
    word.gsub!(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/, '\1U\2')
    word.gsub!(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/, '\1I\2')
    word.gsub!(/([aeiouyâàëéêèïîôûù])y/, '\1Y')
    word.gsub!(/y([aeiouyâàëéêèïîôûù])/, 'Y\1')

    # Determine RV (only the start index of each region is needed):
    # after the third letter for par/col/tap or two leading vowels, otherwise
    # after the first vowel that is not the first letter, else the end of the word.
    if word =~ /^(par|col|tap)/ || word =~ /^[aeiouyâàëéêèïîôûù]{2}/
        rv_index = 3
    else
        # to_s: slicing an empty word returns nil
        rv_index = word[1..word.length].to_s =~ /[aeiouyâàëéêèïîôûù]/
        if rv_index
            # +1 for the skipped first letter, +1 to start after the vowel
            rv_index += 2
        else
            rv_index = word.length
        end
    end

    # R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
    # R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
    r1_index = word =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
    r1_index = r1_index ? r1_index + 2 : word.length

    r1 = word[r1_index..word.length]
    r2_index = r1 =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
    # The match offset is relative to R1, so shift it back into the word
    r2_index = r2_index ? r2_index + 2 + r1_index : word.length

    # R1 never starts before the fourth letter (R2 was derived before this adjustment)
    r1_index = 3 if r1_index < 3

    # Step 1: Standard suffix removal
    # NOTE(review): the Snowball description selects the longest matching
    # suffix; this chain tests the groups in a fixed order instead (e.g.
    # "ement" before "issement") — verify against the reference output.

    a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
    a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
    a3_index = word =~ /(logie|logies)$/
    a4_index = word =~ /(usion|ution|usions|utions)$/
    a5_index = word =~ /(ence|ences)$/
    a6_index = word =~ /(ement|ements)$/
    a7_index = word =~ /(ité|ités)$/
    a8_index = word =~ /(if|ive|ifs|ives)$/
    a9_index = word =~ /(eaux)$/
    a10_index = word =~ /(aux)$/
    a11_index = word =~ /(euse|euses)$/
    a12_index = word =~ /[^aeiouyâàëéêèïîôûù](issement|issements)$/
    a13_index = word =~ /(amment)$/
    a14_index = word =~ /(emment)$/
    a15_index = word =~ /[aeiouyâàëéêèïîôûù](ment|ments)$/

    if a1_index && a1_index >= r2_index
        word = word[0..a1_index - 1]
    elsif a2_index && a2_index >= r2_index
        word = word[0..a2_index - 1]
        a2_index2 = word =~ /(ic)$/
        if a2_index2 && a2_index2 >= r2_index
            word = word[0..a2_index2 - 1]
        else
            word.gsub!(/(ic)$/, 'iqU')
        end
    elsif a3_index && a3_index >= r2_index
        word.gsub!(/(logie|logies)$/, 'log')
    elsif a4_index && a4_index >= r2_index
        word.gsub!(/(usion|ution|usions|utions)$/, 'u')
    elsif a5_index && a5_index >= r2_index
        word.gsub!(/(ence|ences)$/, 'ent')
    elsif a6_index && a6_index >= rv_index
        word = word[0..a6_index - 1]
        tmp = word =~ /(iv)$/
        if !tmp.nil? && tmp >= r2_index
            word.gsub!(/(iv)$/, '')
            tmp = word =~ /(at)$/
            if !tmp.nil? && tmp >= r2_index
                word.gsub!(/(at)$/, '')
            end
        elsif word =~ /(eus)$/
            a6_index2 = word =~ /(eus)$/
            if a6_index2 >= r2_index
                word = word[0..a6_index2 - 1]
            elsif a6_index2 >= r1_index
                word = word[0..a6_index2 - 1] + 'eux'
            end
        elsif !(tmp = (word =~ /(abl|iqU)$/)).nil? && tmp >= r2_index
            word.gsub!(/(abl|iqU)$/, '')
        elsif !(tmp = (word =~ /(ièr|Ièr)$/)).nil? && tmp >= rv_index
            word.gsub!(/(ièr|Ièr)$/, 'i')
        end
    elsif a7_index && a7_index >= r2_index
        word = word[0..a7_index - 1]
        if word =~ /(abil)$/
            a7_index2 = word =~ /(abil)$/
            if a7_index2 >= r2_index
                word = word[0..a7_index2 - 1]
            else
                word = word[0..a7_index2 - 1] + 'abl'
            end
        elsif word =~ /(ic)$/
            a7_index3 = word =~ /(ic)$/
            if a7_index3 && a7_index3 >= r2_index
                word = word[0..a7_index3 - 1]
            else
                word.gsub!(/(ic)$/, 'iqU')
            end
        elsif !(tmp = (word =~ /(iv)$/)).nil? && tmp >= r2_index
            # "iv" is only deleted when it lies in R2 (was `!=`, which also
            # deleted it outside R2: "activité" gave "act" instead of "activ")
            word.gsub!(/(iv)$/, '')
        end
    elsif a8_index && a8_index >= r2_index
        word = word[0..a8_index - 1]
        tmp = word =~ /(at)$/
        if !tmp.nil? && tmp >= r2_index
            word.gsub!(/(at)$/, '')
            tmp = word =~ /(ic)$/
            if !tmp.nil? && tmp >= r2_index
                word.gsub!(/(ic)$/, '')
            else
                word.gsub!(/(ic)$/, 'iqU')
            end
        end
    elsif a9_index
        # Anchored: only the suffix may be rewritten
        word.gsub!(/(eaux)$/, 'eau')
    elsif a10_index && a10_index >= r1_index
        # Anchored: unanchored, "auxiliaux" became "alilial"
        word.gsub!(/(aux)$/, 'al')
    elsif a11_index
        a11_index2 = word =~ /(euse|euses)$/
        if a11_index2 >= r2_index
            word = word[0..a11_index2 - 1]
        elsif a11_index2 >= r1_index
            word = word[0..a11_index2 - 1] + 'eux'
        end
    elsif a12_index && a12_index >= r1_index
        # a12_index points at the non-vowel before the suffix, which is kept
        word = word[0..a12_index]
    elsif a13_index && a13_index >= rv_index
        word.gsub!(/(amment)$/, 'ant')
    elsif a14_index && a14_index >= rv_index
        word.gsub!(/(emment)$/, 'ent')
    elsif a15_index && a15_index >= rv_index
        # a15_index points at the vowel before the suffix, which is kept
        word = word[0..a15_index]
    end

    # Step 2a: Verb suffixes beginning i
    # Done when step 1 left the word untouched, or when the original word
    # ends with one of amment, emment, ment, ments.

    word_step1 = word.clone
    step_2a_done = false
    if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
        step_2a_done = true
        b1_regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
        tmp = word =~ b1_regex
        if !tmp.nil? && tmp >= rv_index
            # Keep the preceding non-vowel (first capture), drop the suffix
            word.gsub!(b1_regex, '\1')
        end
    end

    # Step 2b: Other verb suffixes
    # Only when step 2a ran and removed nothing
    if step_2a_done && word_step1 == word
        b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
        tmp = word =~ b2_regex
        if tmp && tmp >= rv_index
            word.gsub!(b2_regex, '')
        else
            tmp = word =~ /(ions)$/
            if tmp && tmp >= r2_index
                word.gsub!(/(ions)$/, '')
            else
                # Same endings as b3_regex2, but taking a preceding e along
                b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
                tmp = word =~ b3_regex
                if tmp && tmp >= rv_index
                    word.gsub!(b3_regex, '')
                else
                    b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
                    tmp = word =~ b3_regex2
                    if tmp && tmp >= rv_index
                        word.gsub!(b3_regex2, '')
                    end
                end
            end
        end
    end

    if original_word != word.downcase
        # Step 3 (the word was altered by a previous step)
        if word =~ /Y$/
            word.gsub!(/Y$/, 'i')
        elsif word =~ /ç$/
            word.gsub!(/ç$/, 'c')
        end
    else
        # Step 4 (no previous step altered the word)
        # If the word ends s, not preceded by a, i, o, u, è or s, delete it
        tmp = word =~ /([^aiouès])s$/
        if tmp && tmp >= rv_index
            word.gsub!(/([^aiouès])s$/, '\1')
        end
        e1_index = word =~ /ion$/
        tmp = word =~ /[st]ion$/
        if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
            word = word[0..e1_index - 1]
        else
            e2_index = word =~ /(ier|ière|Ier|Ière)$/
            if e2_index && e2_index >= rv_index
                word = word[0..e2_index - 1] + 'i'
            else
                tmp = word =~ /e$/
                if tmp && tmp >= rv_index
                    word.gsub!(/e$/, '')
                elsif !(tmp = (word =~ /guë$/)).nil? && tmp >= rv_index
                    word.gsub!(/guë$/, 'gu')
                end
            end
        end
    end

    # Step 5: Undouble
    word.gsub!(/(en|on)(n)$/, '\1')
    word.gsub!(/(ett)$/, 'et')
    word.gsub!(/(el|eil)(l)$/, '\1')

    # Step 6: Un-accent
    word.gsub!(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e\1')

    # Lowercase the U/I/Y markers introduced at the beginning
    word.downcase.strip

end

# TESTS
# Opens voc.txt and compare the stem result with output.txt

# Runs the regression check only when this file is executed directly, so
# loading it for its `stem` method neither reads files nor prints anything.
if __FILE__ == $PROGRAM_NAME
    # File.readlines closes the files once read (File#lines, used previously,
    # was removed in Ruby 3.0). Lines are stripped of their line endings.
    words = File.readlines('voc.txt', encoding: 'UTF-8').map(&:strip)
    expected_lines = File.readlines('output.txt', encoding: 'UTF-8').map(&:strip)

    errors = 0

    words.each_with_index do |l, i|
        stemmed = stem(l)
        # to_s: output.txt may have fewer lines than voc.txt
        expected = expected_lines[i].to_s
        if stemmed != expected
            puts "Error: #{l} expected: #{expected} actual: #{stemmed}"
            errors += 1
        end
    end

    puts "#{errors} error(s) found, tested #{expected_lines.length} words/stems"
end
	# -*- encoding: utf-8 -*-
	#
	# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
	# Based on the javascript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
	#
	# Testing:
	# It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
	# and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
	#
	# At the time of writing, it fails for 242 of the 20403 words; feel free to edit this gist.

	# Stems a French word following the Snowball French stemming algorithm
	# (http://snowball.tartarus.org/algorithms/french/stemmer.html).
	#
	# The word is trimmed and downcased before processing, so capitalised input
	# and a trailing newline (e.g. a line read from a file) yield the same stem as
	# the bare lowercase word.
	#
	# @param word [String] the word to stem; the argument itself is not modified
	# @return [String] the stem, downcased and stripped
	def stem(word)
	    # Letters in French include the following accented forms,
	    #     â à ç ë é ê è ï î ô û ù
	    # The following letters are vowels:
	    #     a e i o u y â à ë é ê è ï î ô û ù

	    # Normalised input, compared with the working word later on to find out
	    # whether a step altered it. Stripping matters: a trailing "\n" would be
	    # matched as a non-vowel by the negated character classes below (step 6
	    # would then turn "été\n" into "éte").
	    original_word = word.strip.downcase

	    # Working copy, so the in-place gsub! calls never touch original_word
	    word = original_word.dup

	    # Uppercase some part to exclude them later on
	    # (u after q, u/i between vowels and y next to a vowel are not vowels)
	    word.gsub!(/qu/, 'qU')
	    word.gsub!(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/, '\1U\2')
	    word.gsub!(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/, '\1I\2')
	    word.gsub!(/([aeiouyâàëéêèïîôûù])y/, '\1Y')
	    word.gsub!(/y([aeiouyâàëéêèïîôûù])/, 'Y\1')

	    # Determine RV (only the start index of each region is needed):
	    # after the third letter for par/col/tap or two leading vowels, otherwise
	    # after the first vowel that is not the first letter, else the end of the word.
	    if word =~ /^(par|col|tap)/ || word =~ /^[aeiouyâàëéêèïîôûù]{2}/
	        rv_index = 3
	    else
	        # to_s: slicing an empty word returns nil
	        rv_index = word[1..word.length].to_s =~ /[aeiouyâàëéêèïîôûù]/
	        if rv_index
	            # +1 for the skipped first letter, +1 to start after the vowel
	            rv_index += 2
	        else
	            rv_index = word.length
	        end
	    end

	    # R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
	    # R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
	    r1_index = word =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
	    r1_index = r1_index ? r1_index + 2 : word.length

	    r1 = word[r1_index..word.length]
	    r2_index = r1 =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
	    # The match offset is relative to R1, so shift it back into the word
	    r2_index = r2_index ? r2_index + 2 + r1_index : word.length

	    # R1 never starts before the fourth letter (R2 was derived before this adjustment)
	    r1_index = 3 if r1_index < 3

	    # Step 1: Standard suffix removal
	    # NOTE(review): the Snowball description selects the longest matching
	    # suffix; this chain tests the groups in a fixed order instead (e.g.
	    # "ement" before "issement") — verify against the reference output.

	    a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
	    a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
	    a3_index = word =~ /(logie|logies)$/
	    a4_index = word =~ /(usion|ution|usions|utions)$/
	    a5_index = word =~ /(ence|ences)$/
	    a6_index = word =~ /(ement|ements)$/
	    a7_index = word =~ /(ité|ités)$/
	    a8_index = word =~ /(if|ive|ifs|ives)$/
	    a9_index = word =~ /(eaux)$/
	    a10_index = word =~ /(aux)$/
	    a11_index = word =~ /(euse|euses)$/
	    a12_index = word =~ /[^aeiouyâàëéêèïîôûù](issement|issements)$/
	    a13_index = word =~ /(amment)$/
	    a14_index = word =~ /(emment)$/
	    a15_index = word =~ /[aeiouyâàëéêèïîôûù](ment|ments)$/

	    if a1_index && a1_index >= r2_index
	        word = word[0..a1_index - 1]
	    elsif a2_index && a2_index >= r2_index
	        word = word[0..a2_index - 1]
	        a2_index2 = word =~ /(ic)$/
	        if a2_index2 && a2_index2 >= r2_index
	            word = word[0..a2_index2 - 1]
	        else
	            word.gsub!(/(ic)$/, 'iqU')
	        end
	    elsif a3_index && a3_index >= r2_index
	        word.gsub!(/(logie|logies)$/, 'log')
	    elsif a4_index && a4_index >= r2_index
	        word.gsub!(/(usion|ution|usions|utions)$/, 'u')
	    elsif a5_index && a5_index >= r2_index
	        word.gsub!(/(ence|ences)$/, 'ent')
	    elsif a6_index && a6_index >= rv_index
	        word = word[0..a6_index - 1]
	        tmp = word =~ /(iv)$/
	        if !tmp.nil? && tmp >= r2_index
	            word.gsub!(/(iv)$/, '')
	            tmp = word =~ /(at)$/
	            if !tmp.nil? && tmp >= r2_index
	                word.gsub!(/(at)$/, '')
	            end
	        elsif word =~ /(eus)$/
	            a6_index2 = word =~ /(eus)$/
	            if a6_index2 >= r2_index
	                word = word[0..a6_index2 - 1]
	            elsif a6_index2 >= r1_index
	                word = word[0..a6_index2 - 1] + 'eux'
	            end
	        elsif !(tmp = (word =~ /(abl|iqU)$/)).nil? && tmp >= r2_index
	            word.gsub!(/(abl|iqU)$/, '')
	        elsif !(tmp = (word =~ /(ièr|Ièr)$/)).nil? && tmp >= rv_index
	            word.gsub!(/(ièr|Ièr)$/, 'i')
	        end
	    elsif a7_index && a7_index >= r2_index
	        word = word[0..a7_index - 1]
	        if word =~ /(abil)$/
	            a7_index2 = word =~ /(abil)$/
	            if a7_index2 >= r2_index
	                word = word[0..a7_index2 - 1]
	            else
	                word = word[0..a7_index2 - 1] + 'abl'
	            end
	        elsif word =~ /(ic)$/
	            a7_index3 = word =~ /(ic)$/
	            if a7_index3 && a7_index3 >= r2_index
	                word = word[0..a7_index3 - 1]
	            else
	                word.gsub!(/(ic)$/, 'iqU')
	            end
	        elsif !(tmp = (word =~ /(iv)$/)).nil? && tmp >= r2_index
	            # "iv" is only deleted when it lies in R2 (was `!=`, which also
	            # deleted it outside R2: "activité" gave "act" instead of "activ")
	            word.gsub!(/(iv)$/, '')
	        end
	    elsif a8_index && a8_index >= r2_index
	        word = word[0..a8_index - 1]
	        tmp = word =~ /(at)$/
	        if !tmp.nil? && tmp >= r2_index
	            word.gsub!(/(at)$/, '')
	            tmp = word =~ /(ic)$/
	            if !tmp.nil? && tmp >= r2_index
	                word.gsub!(/(ic)$/, '')
	            else
	                word.gsub!(/(ic)$/, 'iqU')
	            end
	        end
	    elsif a9_index
	        # Anchored: only the suffix may be rewritten
	        word.gsub!(/(eaux)$/, 'eau')
	    elsif a10_index && a10_index >= r1_index
	        # Anchored: unanchored, "auxiliaux" became "alilial"
	        word.gsub!(/(aux)$/, 'al')
	    elsif a11_index
	        a11_index2 = word =~ /(euse|euses)$/
	        if a11_index2 >= r2_index
	            word = word[0..a11_index2 - 1]
	        elsif a11_index2 >= r1_index
	            word = word[0..a11_index2 - 1] + 'eux'
	        end
	    elsif a12_index && a12_index >= r1_index
	        # a12_index points at the non-vowel before the suffix, which is kept
	        word = word[0..a12_index]
	    elsif a13_index && a13_index >= rv_index
	        word.gsub!(/(amment)$/, 'ant')
	    elsif a14_index && a14_index >= rv_index
	        word.gsub!(/(emment)$/, 'ent')
	    elsif a15_index && a15_index >= rv_index
	        # a15_index points at the vowel before the suffix, which is kept
	        word = word[0..a15_index]
	    end

	    # Step 2a: Verb suffixes beginning i
	    # Done when step 1 left the word untouched, or when the original word
	    # ends with one of amment, emment, ment, ments.

	    word_step1 = word.clone
	    step_2a_done = false
	    if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
	        step_2a_done = true
	        b1_regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
	        tmp = word =~ b1_regex
	        if !tmp.nil? && tmp >= rv_index
	            # Keep the preceding non-vowel (first capture), drop the suffix
	            word.gsub!(b1_regex, '\1')
	        end
	    end

	    # Step 2b: Other verb suffixes
	    # Only when step 2a ran and removed nothing
	    if step_2a_done && word_step1 == word
	        b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
	        tmp = word =~ b2_regex
	        if tmp && tmp >= rv_index
	            word.gsub!(b2_regex, '')
	        else
	            tmp = word =~ /(ions)$/
	            if tmp && tmp >= r2_index
	                word.gsub!(/(ions)$/, '')
	            else
	                # Same endings as b3_regex2, but taking a preceding e along
	                b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
	                tmp = word =~ b3_regex
	                if tmp && tmp >= rv_index
	                    word.gsub!(b3_regex, '')
	                else
	                    b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
	                    tmp = word =~ b3_regex2
	                    if tmp && tmp >= rv_index
	                        word.gsub!(b3_regex2, '')
	                    end
	                end
	            end
	        end
	    end

	    if original_word != word.downcase
	        # Step 3 (the word was altered by a previous step)
	        if word =~ /Y$/
	            word.gsub!(/Y$/, 'i')
	        elsif word =~ /ç$/
	            word.gsub!(/ç$/, 'c')
	        end
	    else
	        # Step 4 (no previous step altered the word)
	        # If the word ends s, not preceded by a, i, o, u, è or s, delete it
	        tmp = word =~ /([^aiouès])s$/
	        if tmp && tmp >= rv_index
	            word.gsub!(/([^aiouès])s$/, '\1')
	        end
	        e1_index = word =~ /ion$/
	        tmp = word =~ /[st]ion$/
	        if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
	            word = word[0..e1_index - 1]
	        else
	            e2_index = word =~ /(ier|ière|Ier|Ière)$/
	            if e2_index && e2_index >= rv_index
	                word = word[0..e2_index - 1] + 'i'
	            else
	                tmp = word =~ /e$/
	                if tmp && tmp >= rv_index
	                    word.gsub!(/e$/, '')
	                elsif !(tmp = (word =~ /guë$/)).nil? && tmp >= rv_index
	                    word.gsub!(/guë$/, 'gu')
	                end
	            end
	        end
	    end

	    # Step 5: Undouble
	    word.gsub!(/(en|on)(n)$/, '\1')
	    word.gsub!(/(ett)$/, 'et')
	    word.gsub!(/(el|eil)(l)$/, '\1')

	    # Step 6: Un-accent
	    word.gsub!(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e\1')

	    # Lowercase the U/I/Y markers introduced at the beginning
	    word.downcase.strip

	end

	# TESTS
	# Opens voc.txt and compare the stem result with output.txt

	# Runs the regression check only when this file is executed directly, so
	# loading it for its `stem` method neither reads files nor prints anything.
	if __FILE__ == $PROGRAM_NAME
	    # File.readlines closes the files once read (File#lines, used previously,
	    # was removed in Ruby 3.0). Lines are stripped of their line endings.
	    words = File.readlines('voc.txt', encoding: 'UTF-8').map(&:strip)
	    expected_lines = File.readlines('output.txt', encoding: 'UTF-8').map(&:strip)

	    errors = 0

	    words.each_with_index do |l, i|
	        stemmed = stem(l)
	        # to_s: output.txt may have fewer lines than voc.txt
	        expected = expected_lines[i].to_s
	        if stemmed != expected
	            puts "Error: #{l} expected: #{expected} actual: #{stemmed}"
	            errors += 1
	        end
	    end

	    puts "#{errors} error(s) found, tested #{expected_lines.length} words/stems"
	end