# class Person
#   include FindByToken
#   token_search :name, :nickname, :email
# end
#
# Person.find_by_token("oleg andreev", :limit => 100)  # => [...]
#
module FindByToken
  DEFAULT_LIMIT = 100
  DEFAULT_MIN_LENGTH = 3
  DEFAULT_MAX_LENGTH = 64

  def self.included(mod)
    mod.extend(Indexer)
    mod.extend(Finder)
  end

  # This is used to configure the model class with DSL methods
  module Indexer
    attr_accessor :token_search_model_inclusions, :token_search_model

    # Define what to include into the token search model by default:
    #
    #   token_search_inclusions do
    #     include SomeModule
    #     include AnotherModule
    #   end
    #
    def token_search_inclusions(&blk)
      @token_search_model_inclusions = blk
    end

    def token_search(*fields)
      proc_inclusions = @token_search_model_inclusions || F.default_inclusions
      min_length  = DEFAULT_MIN_LENGTH
      name        = self.name.to_s
      class_name  = F.class_name(self, fields)
      assocs_name = class_name.snake_case.plural.to_sym
      model       = F.create_model(class_name, &proc_inclusions)
      F.create_properties(name, model)
      F.setup_association(assocs_name, self)
      F.setup_hooks(assocs_name, fields, self, model, min_length)
      @token_search_model = model
    end
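
    # Roughly speaking, for the Person example in the header the call
    # `token_search :name, :nickname, :email` generates something like the
    # following (an illustrative sketch; names are derived from the helpers below):
    #
    #   class ::TokenSearchPersonNameNicknameEmailEntry
    #     include ::DataMapper::Resource
    #     property   :id,    ::DataMapper::Types::Serial
    #     property   :token, String, :length => 64, :index => true
    #     belongs_to :entity, :class_name => "Person", :child_key => [ :entity_id ], :index => true
    #   end
    #
    #   # and on Person itself:
    #   has n, :token_search_person_name_nickname_email_entries
    #   after(:save) { ... reindex tokens ... }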

    module F
      extend self

      def name_from_fields(fields)
        fields.map{|f| f.to_s }.join("_")
      end

      def class_name(model, fields)
        fields_name = name_from_fields(fields).camel_case
        "TokenSearch#{model.name}#{fields_name}Entry"
      end

      def create_model(class_name, &blk)
        cls = Class.new
        eval("::#{class_name} = cls") # create a top-level const
        cls.module_eval(&blk)
        cls
      end

      def create_properties(class_name, search_model)
        size = DEFAULT_MAX_LENGTH
        search_model.module_eval do
          property :token, String, :length => size, :index => true
          belongs_to :entity, :class_name => class_name, :child_key => [ :entity_id ], :index => true
          def token=(t)
            attribute_set(:token, t.to_s[0, DEFAULT_MAX_LENGTH])
          end
        end
      end

      def setup_association(assocs_name, model)
        model.module_eval do
          has n, assocs_name
        end
      end

      def setup_hooks(assocs_name, fields, model, search_model, min_length)
        model.after(:save) do
          # attrs = dirty_attributes.keys.map{|p| p.name }
          # p [:after_save, fields, attrs, self]
          # unless (attrs & fields).empty? # indexed tokens updated?
          # 1. Remove all previous tokens
          send(assocs_name).destroy!
          # 2. Add new tokens
          F.add_tokens(self, fields, search_model, min_length)
          # end
        end
      end

      def add_tokens(entity, fields, search_model, min_length)
        fields.map do |field|
          tokenize(entity.send(field).to_s, min_length)
        end.flatten.uniq.each do |token|
          sm = search_model.new
          sm.token = token
          sm.entity = entity
          sm.save
        end
      end

      ONE_SPACE = " "
      MANY_SPACES = " "*16

      # Splits the text into lowercased tokens of "word-ish" characters
      # (letters, digits, "@", ".", "-", "_"). A token shorter than min_length
      # is extended with the characters that follow it, up to min_length;
      # anything still shorter after stripping is dropped.
      def tokenize(text, min_length)
        text = text.gsub(/\s+/u, ONE_SPACE).strip.downcase
        tokens = []
        offset = 0
        while i = text.index(/[@\w\.\-_]/u, offset)
          j = text.index(/[^@\w\.\-_]/u, i) # end of the current token, or nil
          l = (j || text.length) - i
          tokens << text[i, l > min_length ? l : min_length].strip
          offset = j or break
        end
        tokens.select{|t| t.size >= min_length }
      end
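
      # A couple of concrete examples (these match the specs at the bottom of this gist):
      #
      #   tokenize(" Oleg - Andreev ", 3)  # => ["oleg", "- a", "andreev"]
      #   tokenize(" Oleg Andreev ",   3)  # => ["oleg", "andreev"]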

      def default_inclusions
        proc do
          include ::DataMapper::Resource
          property :id, ::DataMapper::Types::Serial
        end
      end
    end # F - private functions
  end # Indexer

  # This is used to find model instances using a string of tokens
  module Finder
    IF = Indexer::F
    ORDER = [ :token ]

    def find_by_token(query, dm_query = {})
      tokens = IF.tokenize(query, DEFAULT_MIN_LENGTH)
      arr = tokens.inject(nil) do |set, token|
        subset = @token_search_model.all({
          :token.gte => token,
          :token.lt  => token.next,
          :order     => ORDER,
          :limit     => DEFAULT_LIMIT
        }.merge(dm_query))
        subset = subset.map do |relation|
          relation.entity
        end
        set ? set & subset : subset
      end || []
      arr.uniq!
      arr
    end
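
    # Illustrative sketch of the query semantics (Person example from the header):
    #
    #   Person.find_by_token("oleg and")
    #   # query tokens: ["oleg", "and"]
    #   # each token is matched as a prefix via :token.gte => t, :token.lt => t.next,
    #   # and the per-token result sets are intersected
    #
    #   Person.find_by_token("oleg", :limit => 10)  # extra DataMapper query options are merged in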
  end # Finder
end # FindByToken
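
# A minimal end-to-end sketch of wiring this up. It assumes dm-core ~0.9 with a
# sqlite3 adapter and extlib are loaded, since the code above relies on their
# APIs (property, belongs_to, has, snake_case, plural, ...):
#
#   require "rubygems"
#   require "dm-core"
#
#   DataMapper.setup(:default, "sqlite3::memory:")
#
#   class Person
#     include DataMapper::Resource
#     property :id,       DataMapper::Types::Serial
#     property :name,     String
#     property :nickname, String
#     property :email,    String
#
#     include FindByToken
#     token_search :name, :nickname, :email
#   end
#
#   DataMapper.auto_migrate!
#
#   Person.create(:name => "Oleg", :nickname => "oleganza", :email => "oleg@example.com")
#   Person.find_by_token("oleg")  # => [#<Person ...>]
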
require File.join(File.dirname(__FILE__), "spec_helper")

describe FindByToken do
  describe FindByToken::Indexer do
    before(:each) do
    end

    def tokenize_should(a, b, m = 5)
      FindByToken::Indexer::F.tokenize(a, m).should == b
    end

    it "should tokenize single-word strings" do
      tokenize_should("", %w[], 3)
      tokenize_should("Xy", %w[], 3)
      tokenize_should("1.2", %w[1.2], 3)
      tokenize_should("Xyz", %w[xyz], 3)
      tokenize_should("Oleg", %w[oleg], 3)
      tokenize_should("1Oleg", %w[1oleg], 3)
      tokenize_should("OlegAndreev", %w[olegandreev], 3)
    end

    it "should tokenize double-word strings" do
      tokenize_should(" Oleg - Andreev ", %w[oleg -\ a andreev], 3)
      tokenize_should(" Oleg-Andreev ", %w[oleg-andreev], 3)
      tokenize_should(" Oleg Andreev ", %w[oleg andreev], 3)
    end

    it "should tokenize misc. strings" do
      tokenize_should("Leslie Feist - 1 2 3 4", ["leslie", "feist", "- 1 2", "1 2 3", "2 3 4"], 5)
      tokenize_should("I'm C++ programmer", %w[i'm m\ c c++ programmer], 3)
      tokenize_should("I'm C++ programmer", %w[i'm\ c m\ c++ c++\ p programmer], 5)
      tokenize_should("I am C++ programmer", %w[i\ a c++ programmer], 3)
      tokenize_should("I am C++ programmer", %w[am\ c+ c++\ p programmer], 5)
    end
  end # FindByToken::Indexer
end # FindByToken