Skip to content

Instantly share code, notes, and snippets.

@automatthew
Created July 23, 2009 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save automatthew/153578 to your computer and use it in GitHub Desktop.
Save automatthew/153578 to your computer and use it in GitHub Desktop.
token analysis and expansion
# Configurable token analyzer.
#
# A string is split into tokens by a pluggable tokenizer. Each token is then
# run through, in this order:
#
# 1. every registered transformation (each one rewrites the token),
# 2. the substitution table (an exact-match replacement),
# 3. every registered expansion (each one may propose a weighted variant).
class Analyzer
  # Starts with no expansions, transformations or substitutions, and a
  # default tokenizer that splits with String#split (no arguments).
  def initialize
    @expansions = []
    @transformations = []
    @substitutions = {}
    @tokenizer = lambda { |string| string.split }
  end

  # Replaces the tokenizer with the given block. The block receives the
  # string passed to #tokenize and its result is returned from #tokenize.
  #
  # Raises ArgumentError when called without a block; storing nil here would
  # only surface later as a NoMethodError inside #tokenize.
  def tokenizer(&block)
    raise ArgumentError, "tokenizer requires a block" unless block
    @tokenizer = block
  end

  # Registers an expansion. The block receives the processed token and
  # returns a variant, or nil/false when it does not apply. +cost+ is the
  # weight recorded for the variant (0.0 when omitted).
  #
  # Raises ArgumentError when called without a block.
  def expansion(cost=0.0, &block)
    raise ArgumentError, "expansion requires a block" unless block
    @expansions << [cost, block]
  end

  # Registers an exact-match replacement: a token equal to +input+ (after
  # transformations) is replaced by +output+.
  def substitution(input, output)
    @substitutions[input] = output
  end
  alias_method :sub, :substitution

  # Registers a transformation. The block receives the current token and
  # returns its replacement. Transformations run in registration order.
  #
  # Raises ArgumentError when called without a block.
  def transformation(&block)
    raise ArgumentError, "transformation requires a block" unless block
    @transformations << block
  end

  # Splits +string+ using the current tokenizer.
  def tokenize(string)
    @tokenizer.call(string)
  end

  # Runs one token through transformations, then substitution, then
  # expansions.
  #
  # Returns the processed token alone when no expansion produced a variant,
  # otherwise a two-element array [token, { variant => cost }].
  def process_token(token)
    # Each transformation is fed the result of the previous one.
    token = @transformations.inject(token) do |current, transform|
      transform.call(current)
    end

    # A missing (or nil/false) substitution leaves the token untouched.
    token = @substitutions[token] || token

    # If two expansions yield the same variant, the later cost wins.
    variants = {}
    @expansions.each do |cost, expand|
      variant = expand.call(token)
      variants[variant] = cost if variant
    end

    variants.empty? ? token : [token, variants]
  end

  # Tokenizes +string+ and processes every token with #process_token,
  # returning the results in token order.
  def analyze(string)
    tokenize(string).map { |token| process_token(token) }
  end
end
# Behavioural spec for Analyzer: tokenizing, transformations, substitutions,
# weighted expansions, and the order in which those stages are applied.
describe "An Analyzer" do
  # Every example gets a fresh analyzer with nothing registered.
  before do
    @analyzer = Analyzer.new
  end

  it "can take a custom tokenizer" do
    # Split on runs of whitespace.
    @analyzer.tokenizer { |text| text.split(/\s+/) }
    @analyzer.tokenize("three blind mice").should == ["three", "blind", "mice"]

    # Scan for runs of word characters and apostrophes; the hyphen separates.
    @analyzer.tokenizer { |text| text.scan(/[\w']+/) }
    @analyzer.tokenize("joe's bait-shop").should == ["joe's", "bait", "shop"]
  end

  it "can perform weighted term expansions" do
    @analyzer.expansion(0.5) { |term| term.tr( "'", "") if term =~ /'/ }
    @analyzer.expansion(0.5) { |term| term.chomp("'s") if term =~ /'s$/ }

    # A token that matches expansions comes back paired with its variants.
    expanded = ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    @analyzer.process_token("joe's").should == expanded

    # A token that matches no expansion comes back bare.
    @analyzer.process_token("boring").should == "boring"
  end

  it "can transform terms" do
    @analyzer.transformation { |term| term.reverse }
    @analyzer.process_token("123").should == "321"
  end

  it "can substitute terms" do
    @analyzer.substitution("&", "and")
    @analyzer.process_token("&").should == "and"
  end

  it "expands terms after substitutions" do
    # The expansion only matches the substituted form, not the raw "&".
    @analyzer.expansion { |term| "ampersand" if term == "and" }
    @analyzer.substitution("&", "and")

    expanded = ["and", {"ampersand" => 0.0}]
    @analyzer.process_token("&").should == expanded
  end

  it "substitutes after transformations" do
    # "moe" only equals the substitution key once the transformation ran.
    @analyzer.substitution("joe", "joseph")
    @analyzer.transformation { |term| term.tr('m', 'j') }
    @analyzer.process_token("moe").should == "joseph"
  end

  it "does phrases, if you know how to Enumerable#map" do
    @analyzer.sub("&", "and")
    @analyzer.expansion(0.5) { |term| term.tr( "'", "") if term =~ /'/ }
    @analyzer.expansion(0.5) { |term| term.chomp("'s") if term =~ /'s$/ }
    @analyzer.expansion(3.0) { |term| term.split('-') if term =~ /-/ }
    @analyzer.expansion(0.1) { |term| term.tr('-', '') if term =~ /-/ }

    phrase = "joe's sushi & bait-shop shack"

    # One entry per token, in order; expanded tokens carry a variant => cost hash.
    possessive = ["joe's", {"joe" => 0.5, "joes" => 0.5}]
    hyphenated = ["bait-shop", {"baitshop" => 0.1, ["bait", "shop"] => 3.0}]
    expected = [possessive, "sushi", "and", hyphenated, "shack"]

    @analyzer.analyze(phrase).should == expected
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment