# class Person
#   include FindByToken
#   token_search :name, :nickname, :email
# end
#
# Person.find_by_token("oleg andreev", :limit => 100)  # => [...]
#
module FindByToken
  DEFAULT_LIMIT = 100
  DEFAULT_MIN_LENGTH = 3
  DEFAULT_MAX_LENGTH = 64

  def self.included(mod)
    mod.extend(Indexer)
    mod.extend(Finder)
  end

  # This is used to configure the model class with DSL methods
  module Indexer
    attr_accessor :token_search_model_inclusions, :token_search_model

    # Define what to include into the token search model by default:
    #
    #   token_search_inclusions do
    #     include SomeModule
    #     include AnotherModule
    #   end
    #
    def token_search_inclusions(&blk)
      @token_search_model_inclusions = blk
    end

    def token_search(*fields)
      proc_inclusions = @token_search_model_inclusions || F.default_inclusions
      min_length  = DEFAULT_MIN_LENGTH
      name        = self.name.to_s
      class_name  = F.class_name(self, fields)
      assocs_name = class_name.snake_case.plural.to_sym
      model       = F.create_model(class_name, &proc_inclusions)
      F.create_properties(name, model)
      F.setup_association(assocs_name, self)
      F.setup_hooks(assocs_name, fields, self, model, min_length)
      @token_search_model = model
    end
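
    # Roughly speaking, for the Person example in the header the call
    # `token_search :name, :nickname, :email` generates something like the
    # following (an illustrative sketch; names are derived from the helpers below):
    #
    #   class ::TokenSearchPersonNameNicknameEmailEntry
    #     include ::DataMapper::Resource
    #     property   :id,    ::DataMapper::Types::Serial
    #     property   :token, String, :length => 64, :index => true
    #     belongs_to :entity, :class_name => "Person", :child_key => [ :entity_id ], :index => true
    #   end
    #
    #   # and on Person itself:
    #   has n, :token_search_person_name_nickname_email_entries
    #   after(:save) { ... reindex tokens ... }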

    module F
      extend self

      def name_from_fields(fields)
        fields.map{|f| f.to_s }.join("_")
      end

      def class_name(model, fields)
        fields_name = name_from_fields(fields).camel_case
        "TokenSearch#{model.name}#{fields_name}Entry"
      end

      def create_model(class_name, &blk)
        cls = Class.new
        eval("::#{class_name} = cls") # create a top-level const
        cls.module_eval(&blk)
        cls
      end

      def create_properties(class_name, search_model)
        size = DEFAULT_MAX_LENGTH
        search_model.module_eval do
          property :token, String, :length => size, :index => true
          belongs_to :entity, :class_name => class_name, :child_key => [ :entity_id ], :index => true
          def token=(t)
            attribute_set(:token, t.to_s[0, DEFAULT_MAX_LENGTH])
          end
        end
      end

      def setup_association(assocs_name, model)
        model.module_eval do
          has n, assocs_name
        end
      end

      def setup_hooks(assocs_name, fields, model, search_model, min_length)
        model.after(:save) do
          # attrs = dirty_attributes.keys.map{|p| p.name }
          # p [:after_save, fields, attrs, self]
          # unless (attrs & fields).empty? # indexed tokens updated?
          # 1. Remove all previous tokens
          send(assocs_name).destroy!
          # 2. Add new tokens
          F.add_tokens(self, fields, search_model, min_length)
          # end
        end
      end

      def add_tokens(entity, fields, search_model, min_length)
        fields.map do |field|
          tokenize(entity.send(field).to_s, min_length)
        end.flatten.uniq.each do |token|
          sm = search_model.new
          sm.token = token
          sm.entity = entity
          sm.save
        end
      end

      ONE_SPACE = " "
      MANY_SPACES = " "*16

      # Splits the text into lowercased tokens of "word-ish" characters
      # (letters, digits, "@", ".", "-", "_"). A token shorter than min_length
      # is extended with the characters that follow it, up to min_length;
      # anything still shorter after stripping is dropped.
      def tokenize(text, min_length)
        text = text.gsub(/\s+/u, ONE_SPACE).strip.downcase
        tokens = []
        offset = 0
        while i = text.index(/[@\w\.\-_]/u, offset)
          j = text.index(/[^@\w\.\-_]/u, i) # end of the current token, or nil
          l = (j || text.length) - i
          tokens << text[i, l > min_length ? l : min_length].strip
          offset = j or break
        end
        tokens.select{|t| t.size >= min_length }
      end
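
      # A couple of concrete examples (these match the specs at the bottom of this gist):
      #
      #   tokenize(" Oleg - Andreev ", 3)  # => ["oleg", "- a", "andreev"]
      #   tokenize(" Oleg Andreev ",   3)  # => ["oleg", "andreev"]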

      def default_inclusions
        proc do
          include ::DataMapper::Resource
          property :id, ::DataMapper::Types::Serial
        end
      end
    end # F - private functions
  end # Indexer

  # This is used to find model instances using a string of tokens
  module Finder
    IF = Indexer::F
    ORDER = [ :token ]

    def find_by_token(query, dm_query = {})
      tokens = IF.tokenize(query, DEFAULT_MIN_LENGTH)
      arr = tokens.inject(nil) do |set, token|
        subset = @token_search_model.all({
          :token.gte => token,
          :token.lt  => token.next,
          :order     => ORDER,
          :limit     => DEFAULT_LIMIT
        }.merge(dm_query))
        subset = subset.map do |relation|
          relation.entity
        end
        set ? set & subset : subset
      end || []
      arr.uniq!
      arr
    end
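
    # Illustrative sketch of the query semantics (Person example from the header):
    #
    #   Person.find_by_token("oleg and")
    #   # query tokens: ["oleg", "and"]
    #   # each token is matched as a prefix via :token.gte => t, :token.lt => t.next,
    #   # and the per-token result sets are intersected
    #
    #   Person.find_by_token("oleg", :limit => 10)  # extra DataMapper query options are merged in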
  end # Finder
end # FindByToken
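
# A minimal end-to-end sketch of wiring this up. It assumes dm-core ~0.9 with a
# sqlite3 adapter and extlib are loaded, since the code above relies on their
# APIs (property, belongs_to, has, snake_case, plural, ...):
#
#   require "rubygems"
#   require "dm-core"
#
#   DataMapper.setup(:default, "sqlite3::memory:")
#
#   class Person
#     include DataMapper::Resource
#     property :id,       DataMapper::Types::Serial
#     property :name,     String
#     property :nickname, String
#     property :email,    String
#
#     include FindByToken
#     token_search :name, :nickname, :email
#   end
#
#   DataMapper.auto_migrate!
#
#   Person.create(:name => "Oleg", :nickname => "oleganza", :email => "oleg@example.com")
#   Person.find_by_token("oleg")  # => [#<Person ...>]
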
require File.join(File.dirname(__FILE__), "spec_helper")

describe FindByToken do
  describe FindByToken::Indexer do
    before(:each) do
    end

    def tokenize_should(a, b, m = 5)
      FindByToken::Indexer::F.tokenize(a, m).should == b
    end

    it "should tokenize single-word strings" do
      tokenize_should("", %w[], 3)
      tokenize_should("Xy", %w[], 3)
      tokenize_should("1.2", %w[1.2], 3)
      tokenize_should("Xyz", %w[xyz], 3)
      tokenize_should("Oleg", %w[oleg], 3)
      tokenize_should("1Oleg", %w[1oleg], 3)
      tokenize_should("OlegAndreev", %w[olegandreev], 3)
    end

    it "should tokenize double-word strings" do
      tokenize_should(" Oleg - Andreev ", %w[oleg -\ a andreev], 3)
      tokenize_should(" Oleg-Andreev ", %w[oleg-andreev], 3)
      tokenize_should(" Oleg Andreev ", %w[oleg andreev], 3)
    end

    it "should tokenize misc. strings" do
      tokenize_should("Leslie Feist - 1 2 3 4", ["leslie", "feist", "- 1 2", "1 2 3", "2 3 4"], 5)
      tokenize_should("I'm C++ programmer", %w[i'm m\ c c++ programmer], 3)
      tokenize_should("I'm C++ programmer", %w[i'm\ c m\ c++ c++\ p programmer], 5)
      tokenize_should("I am C++ programmer", %w[i\ a c++ programmer], 3)
      tokenize_should("I am C++ programmer", %w[am\ c+ c++\ p programmer], 5)
    end
  end # FindByToken::Indexer
end # FindByToken