automatthew/analyzer.rb

## analyzer.rb
# Configurable token analyzer.
#
# A string is split into tokens by the tokenizer; each token is then run
# through the registered transformations, looked up in the substitution
# table, and finally offered to every registered expansion, in that order.
class Analyzer

  # Starts with no transformations, substitutions or expansions, and a
  # tokenizer that calls String#split with no arguments.
  def initialize
    @expansions = []
    @transformations = []
    @substitutions = {}
    @tokenizer = lambda { |string| string.split }
  end

  # Replaces the tokenizer. The block receives the string passed to
  # #tokenize and its return value becomes the token list.
  #
  # Raises ArgumentError when no block is given, rather than storing nil
  # and failing later inside #tokenize.
  def tokenizer(&block)
    raise ArgumentError, "tokenizer requires a block" unless block
    @tokenizer = block
  end

  # Registers an expansion. The block receives the processed token and
  # returns a variant, or nil/false when it does not apply. +cost+ is the
  # value recorded against every variant this expansion produces.
  #
  # Raises ArgumentError when no block is given.
  def expansion(cost=0.0, &block)
    raise ArgumentError, "expansion requires a block" unless block
    @expansions << [cost, block]
  end

  # Registers an exact-match replacement: a token equal to +input+ (after
  # transformations) is replaced by +output+.
  def substitution(input, output)
    @substitutions[input] = output
  end

  alias_method :sub, :substitution

  # Registers a transformation. The block receives a token and returns its
  # replacement; transformations run in registration order.
  #
  # Raises ArgumentError when no block is given.
  def transformation(&block)
    raise ArgumentError, "transformation requires a block" unless block
    @transformations << block
  end

  # Returns whatever the current tokenizer returns for +string+.
  def tokenize(string)
    @tokenizer.call(string)
  end

  # Runs a single token through transformations, substitution and
  # expansions.
  #
  # Returns the processed token alone when no expansion produced a variant,
  # otherwise a two-element array [token, {variant => cost}].
  def process_token(token)
    # Each transformation is fed the previous transformation's result.
    token = @transformations.inject(token) { |current, transform| transform.call(current) }

    # A missing entry (or a nil/false output) leaves the token unchanged.
    token = @substitutions[token] || token

    variants = {}
    @expansions.each do |cost, expand|
      variant = expand.call(token)
      # When two expansions yield the same variant, the later cost wins.
      variants[variant] = cost if variant
    end

    variants.empty? ? token : [token, variants]
  end

  # Tokenizes +string+ and returns the result of #process_token for each
  # token, in order.
  def analyze(string)
    tokenize(string).map { |token| process_token(token) }
  end

end

# Behaviour examples for Analyzer. Every example starts from a fresh,
# unconfigured instance created in the `before` hook.
describe "An Analyzer" do

  before do
    @analyzer = Analyzer.new
  end

  it "can take a custom tokenizer" do
    # First tokenizer: split on runs of whitespace.
    @analyzer.tokenizer do |text|
      text.split(/\s+/)
    end
    whitespace_tokens = @analyzer.tokenize("three blind mice")
    whitespace_tokens.should == ["three", "blind", "mice"]

    # Second tokenizer replaces the first: keep word characters and apostrophes.
    @analyzer.tokenizer do |text|
      text.scan(/[\w']+/)
    end
    scanned_tokens = @analyzer.tokenize("joe's bait-shop")
    scanned_tokens.should == ["joe's", "bait", "shop"]
  end

  it "can perform weighted term expansions" do
    # Drop apostrophes entirely.
    @analyzer.expansion(0.5) do |term|
      term.tr( "'", "") if term =~ /'/
    end
    # Strip a trailing possessive.
    @analyzer.expansion(0.5) do |term|
      term.chomp("'s") if term =~ /'s$/
    end

    possessive = ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    @analyzer.process_token("joe's").should == possessive

    # A token no expansion applies to comes back bare.
    @analyzer.process_token("boring").should == "boring"
  end

  it "can transform terms" do
    @analyzer.transformation do |term|
      term.reverse
    end

    reversed = @analyzer.process_token("123")
    reversed.should == "321"
  end

  it "can substitute terms" do
    @analyzer.substitution("&", "and")

    replaced = @analyzer.process_token("&")
    replaced.should == "and"
  end

  it "expands terms after substitutions" do
    @analyzer.expansion do |term|
      "ampersand" if term == "and"
    end
    @analyzer.substitution("&", "and")

    # The expansion sees "and", i.e. the substituted token, not "&".
    expanded = ["and", {"ampersand" => 0.0}]
    @analyzer.process_token("&").should == expanded
  end

  it "substitutes after transformations" do
    @analyzer.substitution("joe", "joseph")
    @analyzer.transformation do |term|
      term.tr('m', 'j')
    end

    # "moe" becomes "joe" first, and only then matches the substitution.
    @analyzer.process_token("moe").should == "joseph"
  end

  it "does phrases, if you know how to Enumerable#map" do
    @analyzer.sub("&", "and")
    @analyzer.expansion(0.5) do |term|
      term.tr( "'", "") if term =~ /'/
    end
    @analyzer.expansion(0.5) do |term|
      term.chomp("'s") if term =~ /'s$/
    end
    @analyzer.expansion(3.0) do |term|
      term.split('-') if term =~ /-/
    end
    @analyzer.expansion(0.1) do |term|
      term.tr('-', '') if term =~ /-/
    end

    phrase = "joe's sushi & bait-shop shack"
    expected_tokens = []
    expected_tokens << ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    expected_tokens << "sushi"
    expected_tokens << "and"
    expected_tokens << ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}]
    expected_tokens << "shack"

    @analyzer.analyze(phrase).should == expected_tokens
  end

end
	# Configurable token analyzer.
	#
	# A string is split into tokens by the tokenizer; each token is then run
	# through the registered transformations, looked up in the substitution
	# table, and finally offered to every registered expansion, in that order.
	class Analyzer

	  # Starts with no transformations, substitutions or expansions, and a
	  # tokenizer that calls String#split with no arguments.
	  def initialize
	    @expansions = []
	    @transformations = []
	    @substitutions = {}
	    @tokenizer = lambda { |string| string.split }
	  end

	  # Replaces the tokenizer. The block receives the string passed to
	  # #tokenize and its return value becomes the token list.
	  #
	  # Raises ArgumentError when no block is given, rather than storing nil
	  # and failing later inside #tokenize.
	  def tokenizer(&block)
	    raise ArgumentError, "tokenizer requires a block" unless block
	    @tokenizer = block
	  end

	  # Registers an expansion. The block receives the processed token and
	  # returns a variant, or nil/false when it does not apply. +cost+ is the
	  # value recorded against every variant this expansion produces.
	  #
	  # Raises ArgumentError when no block is given.
	  def expansion(cost=0.0, &block)
	    raise ArgumentError, "expansion requires a block" unless block
	    @expansions << [cost, block]
	  end

	  # Registers an exact-match replacement: a token equal to +input+ (after
	  # transformations) is replaced by +output+.
	  def substitution(input, output)
	    @substitutions[input] = output
	  end

	  alias_method :sub, :substitution

	  # Registers a transformation. The block receives a token and returns its
	  # replacement; transformations run in registration order.
	  #
	  # Raises ArgumentError when no block is given.
	  def transformation(&block)
	    raise ArgumentError, "transformation requires a block" unless block
	    @transformations << block
	  end

	  # Returns whatever the current tokenizer returns for +string+.
	  def tokenize(string)
	    @tokenizer.call(string)
	  end

	  # Runs a single token through transformations, substitution and
	  # expansions.
	  #
	  # Returns the processed token alone when no expansion produced a variant,
	  # otherwise a two-element array [token, {variant => cost}].
	  def process_token(token)
	    # Each transformation is fed the previous transformation's result.
	    token = @transformations.inject(token) { |current, transform| transform.call(current) }

	    # A missing entry (or a nil/false output) leaves the token unchanged.
	    token = @substitutions[token] || token

	    variants = {}
	    @expansions.each do |cost, expand|
	      variant = expand.call(token)
	      # When two expansions yield the same variant, the later cost wins.
	      variants[variant] = cost if variant
	    end

	    variants.empty? ? token : [token, variants]
	  end

	  # Tokenizes +string+ and returns the result of #process_token for each
	  # token, in order.
	  def analyze(string)
	    tokenize(string).map { |token| process_token(token) }
	  end

	end

	# Behaviour examples for Analyzer. Every example starts from a fresh,
	# unconfigured instance created in the `before` hook.
	describe "An Analyzer" do

	  before do
	    @analyzer = Analyzer.new
	  end

	  it "can take a custom tokenizer" do
	    @analyzer.tokenizer { |string| string.split(/\s+/) }
	    @analyzer.tokenize("three blind mice").should == %w{three blind mice}

	    # A later tokenizer replaces the earlier one.
	    @analyzer.tokenizer { |string| string.scan(/[\w']+/) }
	    @analyzer.tokenize("joe's bait-shop").should == %w{joe's bait shop}
	  end

	  it "can perform weighted term expansions" do
	    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
	    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }

	    @analyzer.process_token("joe's").should == ["joe's", {"joe" => 0.5, "joes" => 0.5}]
	    # A token no expansion applies to comes back bare.
	    @analyzer.process_token("boring").should == "boring"
	  end

	  it "can transform terms" do
	    @analyzer.transformation { |word| word.reverse }
	    @analyzer.process_token("123").should == "321"
	  end

	  it "can substitute terms" do
	    @analyzer.substitution("&", "and")
	    @analyzer.process_token("&").should == "and"
	  end

	  it "expands terms after substitutions" do
	    @analyzer.expansion { |word| "ampersand" if word == "and" }
	    @analyzer.substitution("&", "and")

	    # The expansion sees "and", i.e. the substituted token, not "&".
	    @analyzer.process_token("&").should == ["and", {"ampersand" => 0.0}]
	  end

	  it "substitutes after transformations" do
	    @analyzer.substitution("joe", "joseph")
	    @analyzer.transformation { |word| word.tr('m', 'j') }

	    # "moe" becomes "joe" first, and only then matches the substitution.
	    @analyzer.process_token("moe").should == "joseph"
	  end

	  it "does phrases, if you know how to Enumerable#map" do
	    @analyzer.sub("&", "and")
	    @analyzer.expansion(0.5) { |word| word.tr( "'", "") if word =~ /'/ }
	    @analyzer.expansion(0.5) { |word| word.chomp("'s") if word =~ /'s$/ }
	    @analyzer.expansion(3.0) { |word| word.split('-') if word =~ /-/ }
	    @analyzer.expansion(0.1) { |word| word.tr('-', '') if word =~ /-/ }

	    orig = "joe's sushi & bait-shop shack"
	    analyzed = [
	      ["joe's", {"joe" => 0.5, "joes" => 0.5}],
	      "sushi",
	      "and",
	      ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}],
	      "shack"
	    ]
	    @analyzer.analyze(orig).should == analyzed
	  end

	end