Skip to content

Instantly share code, notes, and snippets.

@paukul
Created September 26, 2011 12:25
Show Gist options
  • Save paukul/1242110 to your computer and use it in GitHub Desktop.
Save paukul/1242110 to your computer and use it in GitHub Desktop.
solr query sanitizer using a statemachine (like a baws!)
# Dependencies: the transitions gem (http://rubygems.org/gems/transitions)
# and ActiveSupport (the test case at the bottom subclasses ActiveSupport::TestCase).
module Search
  # Mixin exposing a single entry point for cleaning user-supplied Solr query
  # input before it is handed to the search backend.
  module QueryBuilderHelper
    # Sanitizes one query value or a hash of field => value pairs.
    #
    # A Hash yields a new Hash with the same keys: each value is run through
    # QueryField.sanitize, except the value stored under the String key '*',
    # which is copied over untouched. Anything that is not a Hash is passed
    # straight to QueryField.sanitize.
    def sanitize_field_for_solr_query(query)
      return QueryField.sanitize(query) unless query.is_a?(Hash)

      query.each_with_object({}) do |(field, value), sanitized|
        sanitized[field] = field == '*' ? value : QueryField.sanitize(value)
      end
    end
  end
end
require 'transitions'
module Search
  # Character-by-character sanitizer for a single Solr query value, driven by a
  # state machine from the transitions gem.
  #
  # Every input character is classified by #process_char and fired as an event.
  # The transition chosen for the current state decides whether the character
  # is appended as-is (#unescaped_print) or with a backslash in front of it
  # (#escaped_print). A final :eos_character event closes anything left open:
  # an unterminated quotation gets its closing quote and a dangling backslash
  # gets escaped.
  #
  # Usage: Search::QueryField.sanitize('(foo) "bar')  # => '\(foo\) "bar"'
  class QueryField
    include Transitions

    # Query-syntax characters that receive a backslash when they appear
    # unescaped outside of a quotation. '*', '"' and '\' are not listed here
    # because #process_char routes them to dedicated events.
    SPECIAL_CHARACTERS = /[+\-&|!(){}\[\]^~?:]/.freeze

    # query - the raw value to sanitize; only Strings are processed, anything
    #         else is handed back unchanged by #sanitize.
    def initialize(query)
      # The output buffer is appended to with <<, so it must be unfrozen.
      @output_string = "".dup
      @current_char = nil
      @query = query
    end

    state_machine do
      # state :idle
      # NOTE(review): the machine evidently starts in :beginning_of_word (a
      # leading '*' is escaped per the accompanying tests); presumably the
      # transitions gem uses the first declared state - confirm.
      state :beginning_of_word
      state :unescaped_outside_of_quotation
      state :escaped_outside_of_quotation   # previous character was a pending backslash
      state :unescaped_inside_of_quotation
      state :escaped_inside_of_quotation    # pending backslash inside "..."
      state :end_of_string

      # Any character without a dedicated event. After a pending backslash
      # outside quotes the backslash did not escape anything meaningful, so
      # escaped_print doubles it.
      event :normal_character do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # Characters matched by SPECIAL_CHARACTERS: escaped outside quotes unless
      # the user already escaped them; left alone inside quotes.
      event :special_character do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # '*' is only escaped when it starts a word; everywhere else it is kept.
      event :star do
        transitions :from => :beginning_of_word, :to => :unescaped_outside_of_quotation, :on_transition => :escaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # Whitespace outside quotes starts a new word; a backslash pending in
      # front of it is doubled via escaped_print.
      event :whitespace do
        transitions :from => :beginning_of_word, :to => :beginning_of_word, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :beginning_of_word, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :beginning_of_word, :on_transition => :escaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # A backslash is always copied through; it toggles between the
      # "pending escape" and "unescaped" states.
      event :backslash do
        transitions :from => :beginning_of_word, :to => :escaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :escaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :escaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # An unescaped '"' opens or closes a quotation; an escaped one is just a
      # character.
      event :quote do
        transitions :from => :beginning_of_word, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_outside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_outside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :unescaped_outside_of_quotation, :on_transition => :unescaped_print
        transitions :from => :escaped_inside_of_quotation, :to => :unescaped_inside_of_quotation, :on_transition => :unescaped_print
      end

      # End of input. @current_char is nil here, so the print callbacks only
      # append their fixed prefix.
      event :eos_character do
        transitions :from => :beginning_of_word, :to => :end_of_string
        transitions :from => :unescaped_outside_of_quotation, :to => :end_of_string
        # A backslash as the very last character would otherwise be left
        # dangling (an escape with nothing to escape); escaped_print appends
        # one more backslash so it becomes a literal, exactly as is done for a
        # backslash followed by whitespace.
        transitions :from => :escaped_outside_of_quotation, :to => :end_of_string, :on_transition => :escaped_print
        transitions :from => :unescaped_inside_of_quotation, :to => :end_of_string, :on_transition => :quote_print
        transitions :from => :escaped_inside_of_quotation, :to => :end_of_string, :on_transition => :escape_and_quote_print
      end
    end

    # Appends a backslash followed by the current character.
    def escaped_print
      @output_string << "\\#{@current_char}"
    end

    # Appends the current character unchanged.
    def unescaped_print
      @output_string << @current_char
    end

    # Appends a backslash and a double quote (then the current character):
    # used at end of input to double a pending backslash inside an open
    # quotation and close that quotation.
    def escape_and_quote_print
      @output_string << "\\\"#{@current_char}"
    end

    # Appends a double quote (then the current character): used at end of
    # input to close an open quotation.
    def quote_print
      @output_string << "\"#{@current_char}"
    end

    # Classifies @current_char and fires the matching state-machine event.
    def process_char
      case @current_char
      when SPECIAL_CHARACTERS
        special_character
      when /\s/
        whitespace
      when "*"
        star
      when "\\"
        backslash
      when '"'
        quote
      else
        normal_character
      end
    end

    # Runs the state machine over the query and returns the sanitized String.
    # Non-String queries are returned as they are.
    def sanitize
      return @query unless @query.is_a?(String)

      @query.each_char do |char|
        @current_char = char
        process_char
      end

      # The end-of-string callbacks interpolate @current_char; it must be nil
      # so they add nothing beyond their own prefix.
      @current_char = nil
      eos_character
      @output_string
    end

    # Convenience wrapper: builds a fresh sanitizer for +query+ and runs it.
    def self.sanitize(query)
      new(query).sanitize
    end
  end
end
# Behavioural spec for Search::QueryBuilderHelper#sanitize_field_for_solr_query.
#
# Reading aid: in the single-quoted literals below, '\\' is one backslash, and a
# backslash in front of any character other than ' or \ is also one literal
# backslash, so '\(' and '\\(' denote the same two-character string.
class TheSolrQuerySanitizer < ActiveSupport::TestCase
  include Search::QueryBuilderHelper

  test "should eat hashes for breakfast" do
    params = {:what => "*", :where => '"""'}
    assert_equal({:what => '\\*', :where => '""""'}, sanitize_field_for_solr_query(params))
  end

  test "should escape a lonely asterisk" do
    assert_equal 'leiter \*', sanitize_field_for_solr_query('leiter *')
    assert_equal '\*', sanitize_field_for_solr_query('*')
  end

  test "should not escape a correctly used asterisk" do
    assert_equal 'manager*', sanitize_field_for_solr_query('manager*')
  end

  test "should escape parantheses" do
    # Already-escaped and unescaped parentheses must end up identical.
    assert_equal '\(peter \)\) a\)\)', sanitize_field_for_solr_query('(peter \)) a))')
    assert_equal '\(peter \)\) a\)\)', sanitize_field_for_solr_query('(peter )) a))')
    assert_equal '\(\)', sanitize_field_for_solr_query('()')
  end

  test "should escape all kind of crazy solr special character wizzardry" do
    '+-&|!(){}[]^~?:'.each_char do |char|
      assert_equal "\\#{char}", sanitize_field_for_solr_query(char)
    end
  end

  test "should close trailing quote if uneven number of quotes" do
    assert_equal '""""', sanitize_field_for_solr_query('"""')
    assert_equal '""', sanitize_field_for_solr_query('""')
    assert_equal '""a""', sanitize_field_for_solr_query('""a"')
  end

  test "should escape backslashes that don't escape a special character" do
    assert_equal '\\\\ \(', sanitize_field_for_solr_query('\ \(')
    assert_equal '\\\\ \(', sanitize_field_for_solr_query('\\\\ \(')
    assert_equal '\\\\\\\\ \(', sanitize_field_for_solr_query('\\\\\ \(')
    assert_equal '\\\\\\\\ \(', sanitize_field_for_solr_query('\\\\\\\\ \(')
    assert_equal '\( \(', sanitize_field_for_solr_query('\( (')
    assert_equal '\\\\\( \(', sanitize_field_for_solr_query('\\\( \(')
  end

  test "should not assplode on empty strings" do
    assert_equal "", sanitize_field_for_solr_query("")
  end

  test "should like escaped 'quotes in quotes'(tm)" do
    assert_equal '"hallo \" ha! \" "', sanitize_field_for_solr_query('"hallo \" ha! \" "')
  end

  test "should handle stars inside and and outside of quotes" do
    assert_equal ' \* " * "', sanitize_field_for_solr_query(' * " * "')
  end

  test "'should love backslashes as last character inside unmatched quotes\\" do
    assert_equal '"ha \\\\"', sanitize_field_for_solr_query('"ha \\')
  end

  test "should escape unescaped backslashes before whitespace" do
    assert_equal ' ha\\\\ ', sanitize_field_for_solr_query(' ha\\ ')
    assert_equal " ha\\\\\t", sanitize_field_for_solr_query(" ha\\\t")
  end

  test "should make sweet sweet love to stuff other than strings (which includes Hendrik)" do
    assert_equal :hendrik, sanitize_field_for_solr_query(:hendrik)
    # assert_nil rather than assert_equal nil: the latter is deprecated in
    # Minitest 5 and announced to fail in Minitest 6.
    assert_nil sanitize_field_for_solr_query(nil)
    assert_equal 1, sanitize_field_for_solr_query(1)
    assert_equal [:foo, 1, "baz"], sanitize_field_for_solr_query([:foo, 1, "baz"])
  end

  test "should crush wicked qa test posting titles" do
    assert_equal(
      'Job Ad LOGO \(14.01.2009\) \- Job Position \( ‘ ` \| / \\\\ , ; \: \& < > \^ \* \? \) \(äöüß\) \(Ромашка\)',
      sanitize_field_for_solr_query("Job Ad LOGO (14.01.2009) - Job Position ( ‘ ` | / \\ , ; : & < > ^ * ? ) (äöüß) (Ромашка)"))
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment