I have a morbid fascination with parsers by this point.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
class Lexer
  # Token table: [type, anchored regex, skip-flag]. Rules flagged as
  # skippable (whitespace) are matched and consumed but never emitted.
  # Order matters only to break ties between equally long matches:
  # earlier rules win (so "AND" lexes as :and, not :term).
  TOKEN_LIST = [
    [:space, /\A([\s]+)/, true],
    [:and, /\A(AND|&&|,)/],
    [:not, /\A(NOT|!|-)/],
    [:or, /\A(OR|\|\|)/],
    [:lparen, /\A\(/],
    [:rparen, /\A\)/],
    [:term, /\A([a-zA-Z\d\*\?\:\._]+)/],
  ].freeze

  # Tokenizes +input+ into an array of [type, text, skip] triples,
  # terminated by an [:eof, '$'] marker.
  # Raises LexerError when no rule matches the head of the input.
  def lex(input)
    tokens = []
    until input.empty?
      # Fetch token, push to array, slice input, repeat.
      token = match_token(input)
      tokens << token unless token[2]
      input = input[token[1].length..-1]
    end
    tokens << [:eof, '$']
    tokens
  end

  # Returns the longest-matching token rule for the head of +input+ as
  # [type, matched_text, skip-flag]. Raises LexerError on no match.
  def match_token(input)
    best_match = nil
    TOKEN_LIST.each do |rule|
      result = rule[1].match(input)
      # A nil result is never a better match.
      next unless result
      text = result[0]
      # BUG FIX: the original compared against MatchData#length, which is
      # the capture-group count (not the matched text length), so the
      # longest match was not reliably chosen (e.g. "ANDroid" lexed as
      # :and "AND" + :term "roid" instead of one :term).
      if best_match.nil? || best_match[1].length < text.length
        best_match = [rule[0], text, rule[2]]
      end
    end
    raise LexerError, "Could not match any token against \"#{input}\"." if best_match.nil?
    best_match
  end
end
class Parser
  attr_accessor :parsed, :requires_query

  # Recursive-descent parser over the lexer's token stream. Builds an
  # Elasticsearch-style query hash immediately on construction and exposes
  # it via #parsed.
  #
  # tokens         - Array of [type, text, skip] triples ending in :eof.
  # default_field  - Symbol field used for terms with no recognized field.
  # allowed_fields - Array of Symbols a "field:value" term may target.
  def initialize(tokens, default_field, allowed_fields)
    @tokens = tokens
    @default_field = default_field
    @allowed_fields = allowed_fields
    @parsed = parse.build
  end

  # Entry point: <binary> EOF. Returns the root AST node.
  def parse
    advance
    node = binary
    expect(:eof)
    node
  end

  # <binary> ::= <unary> ((AND | OR) <binary>)?
  def binary
    node = BinaryASTNode.new
    node.left = unary
    if accept(:and) || accept(:or)
      node.operator = @last[0]
      node.right = binary
      return node
    end
    # No operator followed: flatten the tree by dropping the wrapper node.
    node.left
  end

  # <unary> ::= NOT <unary> | <terminal>
  def unary
    if accept(:not)
      node = UnaryASTNode.new
      node.node = unary
      return node
    end
    # Flatten tree.
    terminal
  end

  # <terminal> ::= <term>+ ('(' <term>+ ')')? | '(' <binary> ')'
  def terminal
    if accept(:term)
      # Adjacent terms are joined with spaces into one multi-word term.
      buffer = @last[1]
      buffer += " #{@last[1]}" while accept(:term)
      # Parenthesized tags, e.g. "rose (flower)".
      if accept(:lparen)
        buffer += " ("
        expect(:term)
        buffer += @last[1]
        buffer += " #{@last[1]}" while accept(:term)
        expect(:rparen)
        buffer += ")"
      end
      node = TermASTNode.new(self, @default_field, @allowed_fields)
      node.term = buffer
      node
    elsif accept(:lparen)
      # ( <binary> )
      result = binary
      expect(:rparen)
      result
    else
      # BUG FIX: guard against a nil @current (exhausted token stream) so
      # this raises ParserError rather than NoMethodError.
      raise ParserError, "Unexpected token #{(@current && @current[0]).inspect}!"
    end
  end

  private

  # Shifts the next token into @current, remembering the previous one.
  def advance
    @last = @current
    @current = @tokens.shift
  end

  # Consumes the current token and returns true iff it has the given type.
  def accept(type)
    return false unless @current && @current[0] == type
    advance
    true
  end

  # Like #accept, but a mismatch is a parse error.
  def expect(type)
    accept(type) or raise ParserError, "Expected #{type.inspect}, got #{@current.inspect}"
  end

  # AND/OR node; maps onto an ES bool query (must/should).
  class BinaryASTNode
    BINARY_TO_ES = { :and => :must, :or => :should }.freeze
    attr_accessor :left, :right, :operator

    def build
      { :bool => { BINARY_TO_ES[operator] => [@left.build, @right.build] } }
    end
  end

  # NOT node. Since there is only one possible value a unary tree node
  # could have, we don't even bother with assigning it an operator here.
  class UnaryASTNode
    attr_accessor :node

    def build
      { :not => @node.build }
    end
  end

  # Leaf node: a single (possibly field-qualified, possibly wildcarded,
  # possibly quoted) search term.
  class TermASTNode
    attr_accessor :term

    def initialize(parser, default_field, allowed_fields)
      @parser = parser
      @default_field = default_field
      @allowed_fields = allowed_fields
    end

    # Wraps +val+ into a range-query fragment ({lt:/gt:/gte:/lte: val})
    # when +range+ names a comparison; otherwise returns +val+ unchanged.
    def normalize_val(val, range = nil)
      return { range.to_sym => val } if %w[lt gt gte lte].include?(range)
      val
    end

    # Checks any terms with colons for whether a field is specified.
    # Splits on the first unescaped colon and returns [field, value];
    # returns nil when no unescaped colon is present.
    def escape_colons
      @term.match(/\A(.*?[^\\]):(.*)\z/) do |m|
        field, val = m[1, 2]
        field.downcase!
        # Range query, e.g. "score.gte:100".
        if field =~ /(.*)\.([gl]te?)\z/
          return [$1.to_sym, normalize_val(val, $2)]
        end
        field = field.to_sym
        # Unknown fields fold the whole "field:value" text back into the
        # default field.
        unless @allowed_fields.include?(field)
          return [@default_field, normalize_val("#{field}:#{val}")]
        end
        return [field, normalize_val(val)]
      end
    end

    # Builds the ES query fragment for this term: a range, match_all,
    # wildcard, or term query depending on the term's contents.
    def build
      # Fully quoted terms are literal: strip the quotes and disallow
      # wildcard interpretation.
      wildcardable = !/\A"([^"]|\\")+"\z/.match(term)
      @term = @term.slice(1, @term.size - 2) unless wildcardable

      field = nil
      field, value = escape_colons if @term.include?(':')
      # No colon, or escape_colons encountered an unescaped colon.
      field, value = [@default_field, normalize_val(@term)] unless field

      if value.is_a? Hash
        # Range query.
        { range: { field => value } }
      elsif wildcardable && (value =~ /(?:^|[^\\])[\*\?]/)
        # '*' and '?' are wildcard characters in the right context.
        # BUG FIX: the original unescape regex was /\\([\A\*\?])/ — the \A
        # anchor is meaningless inside a character class (it matches a
        # literal 'A'), so "\A" sequences were unescaped by mistake.
        value.gsub!(/\\([*?])/, '\1')
        @parser.requires_query = true
        # All-matching wildcard is just a match_all on the collection.
        return { match_all: {} } if value == '*'
        { wildcard: { field => value } }
      else
        normalize_term!(value, !wildcardable)
        { term: { field => value } }
      end
    end

    # Resolves backslash escapes: quoted terms only unescape '\"', while
    # unquoted terms unescape every backslashed character. Mutates +match+.
    def normalize_term!(match, quoted)
      if quoted
        match.gsub!('\"', '"')
      else
        match.gsub!(/\\(.)/, '\1')
      end
    end
  end
end
# Raised by Lexer#match_token when no token rule matches the input head.
class LexerError < StandardError; end

# Raised by Parser when the token stream violates the grammar.
class ParserError < StandardError; end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.