Last active
August 29, 2015 14:02
-
-
Save YuukanOO/cca29b69ba2f4c2e9f54 to your computer and use it in GitHub Desktop.
French stemming algorithm in Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*-
#
# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
# Based on the JavaScript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
#
# Testing:
# It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
# and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
#
# At present, it fails for 242 words out of 20403; feel free to edit this gist.
# Reduces a French word to its stem, following the steps of the Snowball
# French stemming algorithm (marking of non-vowel u/i/y, the RV/R1/R2
# regions, then steps 1 to 6).
#
# The input is stripped of surrounding whitespace and downcased before any
# processing, so "Chanter", "chanter" and "chanter\n" all give the same stem.
#
# NOTE(review): the step 1 suffixes are tested in a fixed order rather than
# by longest match, so some words may still differ from the reference
# Snowball output.
#
# @param word [String] the word to stem
# @return [String] the lowercase stem
def stem(word)
  # Letters in French include the following accented forms,
  # â à ç ë é ê è ï î ô û ù
  # The following letters are vowels:
  # a e i o u y â à ë é ê è ï î ô û ù
  vowel = '[aeiouyâàëéêèïîôûù]'
  non_vowel = '[^aeiouyâàëéêèïîôûù]'

  # Normalised copy of the input, kept untouched so that later steps can tell
  # whether the word has been altered. `word` is a separate mutable copy.
  original_word = word.strip.downcase
  word = original_word.dup

  # Mark u, i and y that must not be treated as vowels by upper-casing them.
  word.gsub!(/qu/, 'qU')
  word.gsub!(/(#{vowel})u(#{vowel})/, '\1U\2')
  word.gsub!(/(#{vowel})i(#{vowel})/, '\1I\2')
  word.gsub!(/(#{vowel})y/, '\1Y')
  word.gsub!(/y(#{vowel})/, 'Y\1')

  # RV starts after the third letter when the word begins with two vowels or
  # with par/col/tap; otherwise after the first vowel that is not the first
  # letter, or at the end of the word when there is no such vowel.
  if word =~ /^(par|col|tap)/ || word =~ /^#{vowel}{2}/
    rv_index = 3
  else
    first_vowel = word[1..-1].to_s =~ /#{vowel}/
    rv_index = first_vowel ? first_vowel + 2 : word.length
  end

  # R1 is the region after the first non-vowel following a vowel, or the end
  # of the word if there is no such non-vowel.
  # R2 is the same region computed inside R1.
  vowel_then_non_vowel = /#{vowel}#{non_vowel}/
  r1_index = word =~ vowel_then_non_vowel
  if r1_index
    r1_index += 2
    r1 = word[r1_index..-1]
  else
    r1_index = word.length
    r1 = ''
  end
  r2_offset = r1 =~ vowel_then_non_vowel
  r2_index = r2_offset ? r1_index + r2_offset + 2 : word.length
  # R2 is derived from the unadjusted R1; only afterwards is R1 forced to
  # start at index 3 or later.
  r1_index = 3 if r1_index < 3

  # Step 1: Standard suffix removal
  a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
  a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
  a3_index = word =~ /(logie|logies)$/
  a4_index = word =~ /(usion|ution|usions|utions)$/
  a5_index = word =~ /(ence|ences)$/
  a6_index = word =~ /(ement|ements)$/
  a7_index = word =~ /(ité|ités)$/
  a8_index = word =~ /(if|ive|ifs|ives)$/
  a9_index = word =~ /(eaux)$/
  a10_index = word =~ /(aux)$/
  a11_index = word =~ /(euse|euses)$/
  # a12 and a15 point at the letter preceding the suffix, which is kept.
  a12_index = word =~ /#{non_vowel}(issement|issements)$/
  a13_index = word =~ /(amment)$/
  a14_index = word =~ /(emment)$/
  a15_index = word =~ /#{vowel}(ment|ments)$/

  if a1_index && a1_index >= r2_index
    word = word[0...a1_index]
  elsif a2_index && a2_index >= r2_index
    word = word[0...a2_index]
    ic_index = word =~ /ic$/
    if ic_index && ic_index >= r2_index
      word = word[0...ic_index]
    else
      word.sub!(/ic$/, 'iqU')
    end
  elsif a3_index && a3_index >= r2_index
    word.sub!(/(logie|logies)$/, 'log')
  elsif a4_index && a4_index >= r2_index
    word.sub!(/(usion|ution|usions|utions)$/, 'u')
  elsif a5_index && a5_index >= r2_index
    word.sub!(/(ence|ences)$/, 'ent')
  elsif a6_index && a6_index >= rv_index
    word = word[0...a6_index]
    iv_index = word =~ /iv$/
    if iv_index && iv_index >= r2_index
      word.sub!(/iv$/, '')
      at_index = word =~ /at$/
      word.sub!(/at$/, '') if at_index && at_index >= r2_index
    elsif (eus_index = word =~ /eus$/)
      if eus_index >= r2_index
        word = word[0...eus_index]
      elsif eus_index >= r1_index
        word = word[0...eus_index] + 'eux'
      end
    elsif (tmp = word =~ /(abl|iqU)$/) && tmp >= r2_index
      word.sub!(/(abl|iqU)$/, '')
    elsif (tmp = word =~ /(ièr|Ièr)$/) && tmp >= rv_index
      word.sub!(/(ièr|Ièr)$/, 'i')
    end
  elsif a7_index && a7_index >= r2_index
    word = word[0...a7_index]
    if (abil_index = word =~ /abil$/)
      word = word[0...abil_index]
      word += 'abl' if abil_index < r2_index
    elsif (ic_index = word =~ /ic$/)
      if ic_index >= r2_index
        word = word[0...ic_index]
      else
        word.sub!(/ic$/, 'iqU')
      end
    elsif (iv_index = word =~ /iv$/) && iv_index >= r2_index
      # "iv" is only deleted when it lies inside R2.
      word.sub!(/iv$/, '')
    end
  elsif a8_index && a8_index >= r2_index
    word = word[0...a8_index]
    at_index = word =~ /at$/
    if at_index && at_index >= r2_index
      word.sub!(/at$/, '')
      ic_index = word =~ /ic$/
      if ic_index && ic_index >= r2_index
        word.sub!(/ic$/, '')
      else
        word.sub!(/ic$/, 'iqU')
      end
    end
  elsif a9_index
    # Anchored so that only the suffix is rewritten.
    word.sub!(/eaux$/, 'eau')
  elsif a10_index && a10_index >= r1_index
    # Anchored so that only the suffix is rewritten.
    word.sub!(/aux$/, 'al')
  elsif a11_index
    if a11_index >= r2_index
      word = word[0...a11_index]
    elsif a11_index >= r1_index
      word = word[0...a11_index] + 'eux'
    end
  elsif a12_index && a12_index >= r1_index
    word = word[0..a12_index]
  elsif a13_index && a13_index >= rv_index
    word.sub!(/(amment)$/, 'ant')
  elsif a14_index && a14_index >= rv_index
    word.sub!(/(emment)$/, 'ent')
  elsif a15_index && a15_index >= rv_index
    word = word[0..a15_index]
  end

  # Step 2a: Verb suffixes beginning i
  # Runs when step 1 left the word untouched, or when the original word ends
  # with amment, emment, ment or ments.
  word_step1 = word.clone
  step_2a_done = false
  if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
    step_2a_done = true
    b1_regex = /(#{non_vowel})(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
    tmp = word =~ b1_regex
    word.sub!(b1_regex, '\1') if tmp && tmp >= rv_index
  end

  # Step 2b: Other verb suffixes
  # Runs only when step 2a ran and removed nothing.
  if step_2a_done && word_step1 == word
    b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
    b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
    b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i

    if (tmp = word =~ b2_regex) && tmp >= rv_index
      word.sub!(b2_regex, '')
    elsif (tmp = word =~ /ions$/) && tmp >= r2_index
      word.sub!(/ions$/, '')
    elsif (tmp = word =~ b3_regex) && tmp >= rv_index
      # Suffix preceded by e: the e is removed as well.
      word.sub!(b3_regex, '')
    elsif (tmp = word =~ b3_regex2) && tmp >= rv_index
      word.sub!(b3_regex2, '')
    end
  end

  if original_word != word.downcase
    # Step 3: the word was altered by a previous step
    if word =~ /Y$/
      word.sub!(/Y$/, 'i')
    elsif word =~ /ç$/
      word.sub!(/ç$/, 'c')
    end
  else
    # Step 4: residual suffix
    # If the word ends s, not preceded by a, i, o, u, è or s, delete it
    tmp = word =~ /([^aiouès])s$/
    word.sub!(/([^aiouès])s$/, '\1') if tmp && tmp >= rv_index

    e1_index = word =~ /ion$/
    tmp = word =~ /[st]ion$/
    if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
      word = word[0...e1_index]
    elsif (e2_index = word =~ /(ier|ière|Ier|Ière)$/) && e2_index >= rv_index
      word = word[0...e2_index] + 'i'
    elsif (tmp = word =~ /e$/) && tmp >= rv_index
      word.sub!(/e$/, '')
    elsif (tmp = word =~ /guë$/) && tmp >= rv_index
      word.sub!(/guë$/, 'gu')
    end
  end

  # Step 5: Undouble
  word.sub!(/(en|on)n$/, '\1')
  word.sub!(/ett$/, 'et')
  word.sub!(/(el|eil)l$/, '\1')

  # Step 6: Un-accent
  word.sub!(/[éè](#{non_vowel}+)$/, 'e\1')

  # Turn the upper-cased markers back into plain letters.
  word.downcase.strip
end
# TESTS
# Stems every word of voc.txt and compares the result with the line at the
# same position in output.txt, printing each mismatch and a final summary.
# Both files are read as UTF-8 and are expected in the current directory.
words = File.readlines('voc.txt', encoding: 'UTF-8').map(&:strip)
expected_stems = File.readlines('output.txt', encoding: 'UTF-8').map(&:strip)
errors = 0
words.each_with_index do |word, i|
  stemmed = stem(word)
  # nil when output.txt has fewer lines than voc.txt; reported as a mismatch.
  expected = expected_stems[i]
  next if stemmed == expected

  puts "Error: #{word} expected: #{expected} actual: #{stemmed}"
  errors += 1
end
puts "#{errors} error(s) found, tested #{expected_stems.length} words/stems"
Replaced gsub by gsub! for simplicity.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Feel free to help and contribute!