# Token analysis and expansion
# Configurable token analyzer.
#
# A string is split into tokens by a pluggable tokenizer; each token is then
# run through three stages, always in this order (see #process_token):
#
#   1. every registered transformation, in registration order;
#   2. an exact-match substitution lookup on the transformed token;
#   3. every registered expansion, whose truthy results are collected as
#      weighted variants of the final token.
class Analyzer
  def initialize
    @expansions = []       # [cost, block] pairs, in registration order
    @transformations = []  # blocks applied to each token, in order
    @substitutions = {}    # token => replacement
    @tokenizer = lambda { |string| string.split } # default: whitespace split
  end

  # Replaces the tokenizer.
  #
  # @yieldparam string [String] the text to split
  # @yieldreturn [Enumerable] the tokens (must respond to #map for #analyze)
  # @return [Proc] the stored tokenizer
  # @raise [ArgumentError] if no block is given (storing nil would only
  #   surface later as a NoMethodError inside #tokenize)
  def tokenizer(&block)
    raise ArgumentError, "tokenizer requires a block" unless block

    @tokenizer = block
  end

  # Registers an expansion. The block receives the fully processed token and
  # returns a variant, or nil/false when it has nothing to offer.
  #
  # @param cost [Numeric] weight recorded against the variant it produces
  # @return [Array] the list of registered [cost, block] pairs
  # @raise [ArgumentError] if no block is given
  def expansion(cost = 0.0, &block)
    raise ArgumentError, "expansion requires a block" unless block

    @expansions << [cost, block]
  end

  # Registers an exact-match replacement applied after transformations.
  #
  # @param input [Object] token to look for
  # @param output [Object] token to use instead
  # @return [Object] +output+
  def substitution(input, output)
    @substitutions[input] = output
  end
  alias_method :sub, :substitution

  # Registers a transformation. The block receives a token and returns the
  # token that the next stage should see.
  #
  # @return [Array<Proc>] the list of registered transformations
  # @raise [ArgumentError] if no block is given
  def transformation(&block)
    raise ArgumentError, "transformation requires a block" unless block

    @transformations << block
  end

  # Splits +string+ with the current tokenizer.
  #
  # @param string [String]
  # @return [Object] whatever the tokenizer block returns
  def tokenize(string)
    @tokenizer.call(string)
  end

  # Runs a single token through transformations, substitution and expansions.
  #
  # @param token [Object]
  # @return [Object, Array] the processed token alone when no expansion
  #   produced a variant, otherwise +[token, { variant => cost }]+
  def process_token(token)
    token = @transformations.reduce(token) { |current, block| block.call(current) }

    # A nil/false replacement is treated as "no substitution".
    token = @substitutions[token] || token

    # When two expansions yield the same variant, the later cost wins.
    variants = @expansions.each_with_object({}) do |(cost, block), found|
      variant = block.call(token)
      found[variant] = cost if variant
    end

    variants.empty? ? token : [token, variants]
  end

  # Tokenizes +string+ and processes every token.
  #
  # @param string [String]
  # @return [Array] one #process_token result per token
  def analyze(string)
    tokenize(string).map { |token| process_token(token) }
  end
end
# Behavioural spec for Analyzer, written in describe/it/should style.
# Each example starts from a fresh, unconfigured Analyzer.
describe "An Analyzer" do
  before do
    @analyzer = Analyzer.new
  end

  it "can take a custom tokenizer" do
    @analyzer.tokenizer { |string| string.split(/\s+/) }
    @analyzer.tokenize("three blind mice").should == %w{three blind mice}

    # Scanning for word characters and apostrophes also splits on the hyphen.
    @analyzer.tokenizer { |string| string.scan(/[\w']+/) }
    @analyzer.tokenize("joe's bait-shop").should == %w{joe's bait shop}
  end

  it "can perform weighted term expansions" do
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }

    @analyzer.process_token("joe's").should == ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    # A token with no variants comes back bare, not wrapped in an array.
    @analyzer.process_token("boring").should == "boring"
  end

  it "can transform terms" do
    @analyzer.transformation { |word| word.reverse }
    @analyzer.process_token("123").should == "321"
  end

  it "can substitute terms" do
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == "and"
  end

  it "expands terms after substitutions" do
    # The expansion is registered first, yet it sees the substituted token.
    @analyzer.expansion { |word| "ampersand" if word == "and" }
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == ["and", {"ampersand" => 0.0}]
  end

  it "substitutes after transformations" do
    # The substitution is registered first, yet it sees the transformed token.
    @analyzer.substitution("joe", "joseph")
    @analyzer.transformation { |word| word.tr('m', 'j') }
    @analyzer.process_token("moe").should == "joseph"
  end

  it "does phrases, if you know how to Enumerable#map" do
    @analyzer.sub("&", "and")
    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }
    # An expansion may return a non-string variant, here an array of parts.
    @analyzer.expansion(3.0) { |word| word.split('-') if word =~ /-/ }
    @analyzer.expansion(0.1) { |word| word.tr('-', '') if word =~ /-/ }

    orig = "joe's sushi & bait-shop shack"
    analyzed = [
      ["joe's", {"joe" => 0.5, "joes" => 0.5}],
      "sushi",
      "and",
      ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}],
      "shack"
    ]
    @analyzer.analyze(orig).should == analyzed
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment