Created
December 19, 2011 22:13
-
-
Save athoune/1499150 to your computer and use it in GitHub Desktop.
Playing with Tire analyzers for the French language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: utf-8
require 'uri'

require 'rubygems'
require 'tire'
#
# Playing with analyzers and their parameters.
# Which fits best for the French language?
#
# $ rvm use 1.9.2
# $ gem install tire
# $ ruby test.rb
#
# Compatibility shim: gives plain Hashes a Rails-style #to_param.
# NOTE(review): presumably Tire calls Hash#to_param when building request
# query strings and expects Rails/ActiveSupport to supply it -- confirm.
class Hash
  # Only define the method when nothing else has, so an existing
  # implementation (e.g. one loaded from ActiveSupport) is not overwritten.
  unless method_defined?(:to_param)
    # Serializes the hash as a URL query string.
    #
    # Keys and values are converted with #to_s and form-encoded, so characters
    # such as "&", "=", spaces or accented letters cannot corrupt the result.
    #
    # @return [String] "key=value" pairs joined with "&" ("" for an empty hash)
    def to_param
      map do |key, value|
        escaped_key   = URI.encode_www_form_component(key.to_s)
        escaped_value = URI.encode_www_form_component(value.to_s)
        "#{escaped_key}=#{escaped_value}"
      end.join('&')
    end
  end
end
# Stop words removed by the "stop_francais" token filter.
french_stopwords = %w{je tu il nous vous ils le la les un une des a ai et est ayons ça}

# Custom token filters registered on the index.
token_filters = {
  stop_francais: { type: 'stop', stopwords: french_stopwords }
}

# Custom analyzers registered on the index:
#   francais       - standard tokenizer followed by a hand-picked filter chain
#   francais_boule - snowball analyzer configured for French
custom_analyzers = {
  francais: {
    type: 'custom',
    tokenizer: 'standard',
    filter: %w{lowercase stop_francais asciifolding elision}
  },
  francais_boule: { type: 'snowball', language: 'French' }
}

index_settings = {
  settings: {
    analysis: { analyzer: custom_analyzers, filter: token_filters }
  }
}

# Delete the 'analyzers' index, then create it with the settings above.
tire = Tire::Index.new('analyzers')
tire.delete
tire.create(index_settings)
# Sample sentence run through every analyzer below.
sample_text = "Je mange des carottes, des petits pois et des pèches (bien que ça coule sur les doigts). J'aime ça."

# For each analyzer, print its name, then the list of tokens it produces
# for the sample sentence.
analyzer_names = %w{whitespace standard simple french francais francais_boule}
analyzer_names.each do |analyzer_name|
  p analyzer_name
  analysis = tire.analyze(sample_text, :analyzer => analyzer_name)
  print " "
  tokens = analysis['tokens'].map { |entry| entry['token'] }
  p tokens
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment