Created
September 14, 2010 21:31
-
-
Save albertoperdomo/579816 to your computer and use it in GitHub Desktop.
Normalize terms for search in DB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Así he implementado tu propuesta como scope: | |
named_scope :normalized_name_begins_with, lambda { |term| | |
map = [ | |
"âãäåāăąÁÂÃÄÅĀĂĄèééêëēĕėęěĒĔĖĘĚìíîïìĩīĭÌÍÎÏÌĨĪĬóôõöōŏőÒÓÔÕÖŌŎŐùúûüũūŭůÙÚÛÜŨŪŬŮñÑçÇüÜ".mb_chars, | |
"aaaaaaaaaaaaaaaeeeeeeeeeeeeeeeiiiiiiiiiiiiiiiiooooooooooooooouuuuuuuuuuuuuuuunnccuu"] | |
# ActiveSupport::Multibyte::Chars#tr broken? | |
adapted_word = "%#{term.mb_chars.split(//).map {|e| (p = map[0].index(e)) ? map[1][p,1] : e.to_s }.join.downcase}%" | |
{ | |
:conditions => ["lower(translate(localized_city_names.name, '#{map[0]}', '#{map[1]}')) like #{LocalizedCityName.connection.quote term}"] | |
} | |
} | |
# params[:name] = 'tel' | |
# params[:limit] = 10 | |
# Esta es la consulta en Ruby | |
LocalizedCityName.normalized_name_begins_with(params[:name]).descend_by_country_match_and_population(current_user.city.country_iso_code_two_letters).all(params[:limit]) | |
# Esta es la consulta que genera con translate | |
LocalizedCityName Load (4940.1ms) SELECT "localized_city_names".*, ("cities".country_iso_code_two_letters = 'ES') AS in_country FROM "localized_city_names" INNER JOIN "cities" ON "cities".id = "localized_city_names".city_id WHERE (lower(translate(localized_city_names.name, 'âãäåāăąÁÂÃÄÅĀĂĄèééêëēĕėęěĒĔĖĘĚìíîïìĩīĭÌÍÎÏÌĨĪĬóôõöōŏőÒÓÔÕÖŌŎŐùúûüũūŭůÙÚÛÜŨŪŬŮñÑçÇüÜ', 'aaaaaaaaaaaaaaaeeeeeeeeeeeeeeeiiiiiiiiiiiiiiiiooooooooooooooouuuuuuuuuuuuuuuunnccuu')) like E'tel%') ORDER BY in_country DESC, cities.population DESC | |
# Un problema que tengo es que por alguna razón ignora el limit | |
# Por otro lado tarda bastante, casi 5 segundos | |
# Devuelve 54 resultados | |
#EXPLAIN | |
"Sort (cost=6234.21..6235.64 rows=571 width=59) (actual time=4911.203..4911.211 rows=54 loops=1)" | |
" Sort Key: (((cities.country_iso_code_two_letters)::text = 'ES'::text)), cities.population" | |
" Sort Method: quicksort Memory: 22kB" | |
" -> Nested Loop (cost=0.00..6208.07 rows=571 width=59) (actual time=2.433..4910.833 rows=54 loops=1)" | |
" -> Seq Scan on localized_city_names (cost=0.00..3093.01 rows=571 width=48) (actual time=2.409..4909.622 rows=54 loops=1)" | |
" Filter: (lower(translate((name)::text, 'âãäåāăąÁÂÃÄÅĀĂĄèééêëēĕėęěĒĔĖĘĚìíîïìĩīĭÌÍÎÏÌĨĪĬóôõöōŏőÒÓÔÕÖŌŎŐùúûüũūŭůÙÚÛÜŨŪŬŮñÑçÇüÜ'::text, 'aaaaaaaaaaaaaaaeeeeeeeeeeeeeeeiiiiiiiiiiiiiiiiooooooooooooooouuuuuuuuuuuuuuuunnccuu'::text)) ~~ 'tel%'::text)" | |
" -> Index Scan using cities_pkey on cities (cost=0.00..5.44 rows=1 width=15) (actual time=0.015..0.016 rows=1 loops=54)" | |
" Index Cond: (cities.id = localized_city_names.city_id)" | |
"Total runtime: 4911.370 ms" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Así he implementado el scope usando regex basándome en un post que encontré en Google. Tiene menos mapeos de caracteres, igual la diferencia en tiempo de consulta se debe a eso? | |
#La URL del post original es: http://blog.nelsonsilva.eu/2008/12/15/rails-postgresql-accent-insensitive-search/ | |
named_scope :normalized_name_begins_with, lambda { |term| | |
{ | |
:conditions => ['localized_city_names.name ~* ?', "^" + term.downcase.accent_insensitive_regexp] | |
} | |
} | |
#He parcheado la clase string de la siguiente forma: | |
class String | |
@@ACCENT_INSENSITIVE_REGEXES=[ | |
"(a|á|à|â|ã|A|Á|À|Â|Ã)","(e|é|è|ê|E|É|È|Ê)","(i|í|ì|I|Í|Ì)","(o|ó|ò|ô|õ|O|Ó|Ò|Ô|Õ)","(u|ú|ù|U|Ú|Ù)","(c|ç|Ç)" | |
] | |
# @@ACCENT_INSENSITIVE_REGEXES=[ | |
# "(a|á|à|â|ã)","(e|é|è|ê)","(i|í|ì)","(o|ó|ò|ô|õ)","(u|ú|ù)","(c|ç)" | |
# ] | |
def accent_insensitive_regexp | |
res=self | |
@@ACCENT_INSENSITIVE_REGEXES.each {|exp| | |
res.gsub! Regexp.new(exp), exp | |
} | |
res | |
end | |
alias_method :old_upcase, :upcase | |
alias_method :old_downcase, :downcase | |
@@DOWNCASE_ACCENT_CHARS="çàáèéìíòóùúâêôãõ" | |
@@UPCASE_ACCENT_CHARS="ÇÀÁÈÉÌÍÒÓÙÚÂÊÔÃÕ" | |
def upcase | |
self.old_upcase.tr(@@DOWNCASE_ACCENT_CHARS,@@UPCASE_ACCENT_CHARS) | |
end | |
def downcase | |
self.old_downcase.tr(@@UPCASE_ACCENT_CHARS,@@DOWNCASE_ACCENT_CHARS) | |
end | |
end | |
# params[:name] = 'tel' | |
# params[:limit] = 10 | |
# El otro scope que uso en la consulta | |
named_scope :descend_by_country_match_and_population, lambda { |country_code| | |
{ | |
:select => %{"localized_city_names".*, ("cities".country_iso_code_two_letters = 'ES') AS in_country}, | |
:joins => :city, | |
:order => "in_country DESC, cities.population DESC" | |
} | |
} | |
# Esta es la consulta en Ruby | |
LocalizedCityName.normalized_name_begins_with(params[:name]).descend_by_country_match_and_population(current_user.city.country_iso_code_two_letters).all(params[:limit]) | |
# Esta es la consulta que genera con regex | |
LocalizedCityName Load (201.2ms) SELECT "localized_city_names".*, ("cities".country_iso_code_two_letters = 'ES') AS in_country FROM "localized_city_names" INNER JOIN "cities" ON "cities".id = "localized_city_names".city_id WHERE (localized_city_names.name ~* E'^t(e|é|è|ê|E|É|È|Ê)l') ORDER BY in_country DESC, cities.population DESC | |
# EXPLAIN | |
"Sort (cost=2612.98..2613.01 rows=11 width=59) (actual time=191.220..191.226 rows=54 loops=1)" | |
" Sort Key: (((cities.country_iso_code_two_letters)::text = 'ES'::text)), cities.population" | |
" Sort Method: quicksort Memory: 22kB" | |
" -> Nested Loop (cost=0.00..2612.79 rows=11 width=59) (actual time=0.128..190.946 rows=54 loops=1)" | |
" -> Seq Scan on localized_city_names (cost=0.00..2521.57 rows=11 width=48) (actual time=0.109..190.143 rows=54 loops=1)" | |
" Filter: ((name)::text ~* '^t(e|é|è|ê|E|É|È|Ê)l'::text)" | |
" -> Index Scan using cities_pkey on cities (cost=0.00..8.28 rows=1 width=15) (actual time=0.010..0.011 rows=1 loops=54)" | |
" Index Cond: (cities.id = localized_city_names.city_id)" | |
"Total runtime: 191.390 ms" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment