# class Person
#   include FindByToken
#   token_search :name, :nickname, :email
# end
#
# Person.find_by_token("oleg andreev", :limit => 100) # => [...]
#
module FindByToken
  DEFAULT_LIMIT      = 100
  DEFAULT_MIN_LENGTH = 3
  DEFAULT_MAX_LENGTH = 64

  def self.included(mod)
    mod.extend(Indexer)
    mod.extend(Finder)
  end

  # This is used to configure a model class with DSL methods.
  module Indexer
    attr_accessor :token_search_model_inclusions, :token_search_model

    # Define what to include into the token search model by default:
    #   token_search_inclusions do
    #     include SomeModule
    #     include AnotherModule
    #   end
    def token_search_inclusions(&blk)
      @token_search_model_inclusions = blk
    end

    def token_search(*fields)
      proc_inclusions = @token_search_model_inclusions || F.default_inclusions
      min_length = DEFAULT_MIN_LENGTH
      name = self.name.to_s
      class_name = F.class_name(self, fields)
      assocs_name = class_name.snake_case.plural.to_sym
      model = F.create_model(class_name, &proc_inclusions)
      F.create_properties(name, model)
      F.setup_association(assocs_name, self)
      F.setup_hooks(assocs_name, fields, self, model, min_length)
      @token_search_model = model
    end

    module F
      extend self

      def name_from_fields(fields)
        fields.map{|f| f.to_s }.join("_")
      end

      def class_name(model, fields)
        fields_name = name_from_fields(fields).camel_case
        "TokenSearch#{model.name}#{fields_name}Entry"
      end

      def create_model(class_name, &blk)
        cls = Class.new
        eval("::#{class_name} = cls") # create a top-level const
        cls.module_eval(&blk)
        cls
      end
      def create_properties(class_name, search_model)
        size = DEFAULT_MAX_LENGTH
        search_model.module_eval do
          property :token, String, :length => size, :index => true
          belongs_to :entity, :class_name => class_name, :child_key => [ :entity_id ], :index => true
          def token=(t)
            attribute_set(:token, t.to_s[0, DEFAULT_MAX_LENGTH])
          end
        end
      end

      def setup_association(assocs_name, model)
        model.module_eval do
          has n, assocs_name
        end
      end

      def setup_hooks(assocs_name, fields, model, search_model, min_length)
        model.after(:save) do
          # attrs = dirty_attributes.keys.map{|p| p.name }
          # p [:after_save, fields, attrs, self]
          # unless (attrs & fields).empty? # indexed tokens updated?
          # 1. Remove all previous tokens
          send(assocs_name).destroy!
          # 2. Add new tokens
          F.add_tokens(self, fields, search_model, min_length)
          # end
        end
      end
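
      # E.g. with fields = [:name] and entity.name == "Oleg Andreev"
      # (min_length 3), this saves two entries whose tokens are
      # "oleg" and "andreev".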
      def add_tokens(entity, fields, search_model, min_length)
        fields.map do |field|
          tokenize(entity.send(field).to_s, min_length)
        end.
        flatten.
        uniq.
        each do |token|
          sm = search_model.new
          sm.token = token
          sm.entity = entity
          sm.save
        end
      end

      ONE_SPACE = " "
      MANY_SPACES = " "*16
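
      # Splits `text` into lowercase tokens of at least `min_length` chars:
      # runs of characters matching [@\w.-_] become tokens, and runs shorter
      # than min_length are padded with the characters that follow them.
      # Examples (cf. the spec below):
      #   tokenize("Xy", 3)                  # => []
      #   tokenize(" Oleg Andreev ", 3)      # => ["oleg", "andreev"]
      #   tokenize(" Oleg-Andreev ", 3)      # => ["oleg-andreev"]
      #   tokenize("I am C++ programmer", 3) # => ["i a", "c++", "programmer"]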
      def tokenize(text, min_length)
        text = text.gsub(/\s+/u, ONE_SPACE).strip.downcase
        tokens = []
        offset = 0
        while i = text.index(/[@\w\.\-_]/u, offset)
          j = text.index(/[^@\w\.\-_]/u, i)
          l = (j || text.length) - i
          tokens << text[i, l > min_length ? l : min_length].strip
          offset = j or break
        end
        tokens.select{|t| t.size >= min_length }
      end

      def default_inclusions
        proc do
          include ::DataMapper::Resource
          property :id, ::DataMapper::Types::Serial
        end
      end
    end # F - private functions
  end # Indexer

  # This is used to find model instances using a string of tokens.
  module Finder
    IF = Indexer::F
    ORDER = [ :token ]

    def find_by_token(query, dm_query = {})
      tokens = IF.tokenize(query, DEFAULT_MIN_LENGTH)
      arr = tokens.inject(nil) do |set, token|
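        # Prefix match via an index range scan: every stored token that
        # starts with `token` sorts between `token` and `token.next`
        # (e.g. "oleg" <= t < "oleh"), so :gte/:lt selects all completions.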
        subset = @token_search_model.all({
          :token.gte => token,
          :token.lt => token.next,
          :order => ORDER,
          :limit => DEFAULT_LIMIT
        }.merge(dm_query))
        subset = subset.map do |relation|
          relation.entity
        end
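        # A record must match every token in the query (AND semantics),
        # so intersect with the matches for the previous tokens.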
        set ? set & subset : subset
      end || []
      arr.uniq!
      arr
    end
  end # Finder
end # FindByToken
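
# A minimal usage sketch, assuming a Ruby 1.8 / DataMapper 0.9-era
# environment with extlib loaded (for String#snake_case and String#plural);
# Person and its properties are illustrative, not part of this file:
#
#   class Person
#     include ::DataMapper::Resource
#     include FindByToken
#     property :id,    ::DataMapper::Types::Serial
#     property :name,  String
#     property :email, String
#     token_search :name, :email
#   end
#
#   Person.create(:name => "Oleg Andreev", :email => "oleg@example.com")
#   Person.find_by_token("oleg")         # => [#<Person ...>]
#   Person.find_by_token("oleg andreev") # intersection of both tokens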

require File.join(File.dirname(__FILE__), "spec_helper")

describe FindByToken do
  describe FindByToken::Indexer do
    before(:each) do
    end

    def tokenize_should(a, b, m = 5)
      FindByToken::Indexer::F.tokenize(a, m).should == b
    end

    it "should tokenize single-word strings" do
      tokenize_should("", %w[], 3)
      tokenize_should("Xy", %w[], 3)
      tokenize_should("1.2", %w[1.2], 3)
      tokenize_should("Xyz", %w[xyz], 3)
      tokenize_should("Oleg", %w[oleg], 3)
      tokenize_should("1Oleg", %w[1oleg], 3)
      tokenize_should("OlegAndreev", %w[olegandreev], 3)
    end

    it "should tokenize double-word strings" do
      tokenize_should(" Oleg - Andreev ", %w[oleg -\ a andreev], 3)
      tokenize_should(" Oleg-Andreev ", %w[oleg-andreev], 3)
      tokenize_should(" Oleg Andreev ", %w[oleg andreev], 3)
    end

    it "should tokenize misc. strings" do
      tokenize_should("Leslie Feist - 1 2 3 4", ["leslie", "feist", "- 1 2", "1 2 3", "2 3 4"], 5)
      tokenize_should("I'm C++ programmer", %w[i'm m\ c c++ programmer], 3)
      tokenize_should("I'm C++ programmer", %w[i'm\ c m\ c++ c++\ p programmer], 5)
      tokenize_should("I am C++ programmer", %w[i\ a c++ programmer], 3)
      tokenize_should("I am C++ programmer", %w[am\ c+ c++\ p programmer], 5)
    end
  end # FindByToken::Indexer
end # FindByToken