@liamwhite
Last active September 26, 2015 12:15
I have a morbid fascination with parsers by this point.
class Lexer
  # Each rule is [token name, anchored pattern, skip flag]; rules flagged to
  # skip (whitespace) are matched but never emitted as tokens.
  TOKEN_LIST = [
    [:space,  /\A([\s]+)/, true],
    [:and,    /\A(AND|&&|,)/],
    [:not,    /\A(NOT|!|-)/],
    [:or,     /\A(OR|\|\|)/],
    [:lparen, /\A\(/],
    [:rparen, /\A\)/],
    [:term,   /\A([a-zA-Z\d\*\?\:\._]+)/],
  ]

  def lex(input)
    tokens = []
    until input.empty?
      # Fetch token, push to array, slice input, repeat.
      token = match_token(input)
      tokens << token unless token[2]
      input = input[token[1].length .. -1]
    end
    tokens << [:eof, '$']
    tokens
  end

  def match_token(input)
    best_match = nil
    TOKEN_LIST.each do |rule|
      result = rule[1].match(input)
      # Result must not be nil to be a better match. Compare matched-text
      # lengths so the longest match wins (MatchData#length counts capture
      # groups, not characters).
      if result and (best_match.nil? or best_match[1].length < result[0].length)
        best_match = [rule[0], result.to_s, rule[2]]
      end
    end
    raise LexerError, "Could not match any token against \"#{input}\"." if best_match.nil?
    best_match
  end
end
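
# A rough sanity check of the lexer on a sample query; the token kinds shown
# are what TOKEN_LIST should produce, not captured output:
#
#   Lexer.new.lex('cat AND (dog OR -bird)').map(&:first)
#   # => [:term, :and, :lparen, :term, :or, :not, :term, :rparen, :eof]
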
class Parser
  attr_accessor :parsed, :requires_query

  def initialize(tokens, default_field, allowed_fields)
    @tokens = tokens
    @default_field = default_field
    @allowed_fields = allowed_fields
    @parsed = parse.build
  end

  def parse
    advance
    node = binary
    expect(:eof)
    node
  end

  def binary
    node = BinaryASTNode.new
    node.left = unary

    # <unary> AND <binary>
    if accept(:and) or accept(:or)
      node.operator = @last[0]
      node.right = binary
      return node
    end

    # Flatten tree
    node.left
  end

  def unary
    # NOT <unary>
    if accept(:not)
      node = UnaryASTNode.new
      node.node = unary
      return node
    end

    # Flatten tree
    terminal
  end

  def terminal
    if accept(:term)
      # Join tags with spaces.
      buffer = @last[1]
      while accept(:term)
        buffer += " #{@last[1]}"
      end

      # Parenthesized tags.
      if accept(:lparen)
        buffer += " ("
        expect(:term)
        buffer += @last[1]
        while accept(:term)
          buffer += " #{@last[1]}"
        end
        expect :rparen
        buffer += ")"
      end

      node = TermASTNode.new(self, @default_field, @allowed_fields)
      node.term = buffer
      node
    elsif accept(:lparen)
      # ( <binary> )
      result = binary
      expect(:rparen)
      result
    else
      raise ParserError, "Unexpected token #{@current[0].inspect}!"
    end
  end

  private

  def advance
    @last = @current
    @current = @tokens.shift
  end

  def accept(type)
    if @current and @current[0] == type
      advance
      true
    else
      false
    end
  end

  def expect(type)
    accept(type) or raise ParserError, "Expected #{type.inspect}, got #{@current.inspect}"
  end

  class BinaryASTNode
    BINARY_TO_ES = {:and => :must, :or => :should}
    attr_accessor :left, :right, :operator

    def build
      {:bool => {BINARY_TO_ES[operator] => [@left.build, @right.build]}}
    end
  end

  class UnaryASTNode
    # Since there is only one possible value a unary tree node could have,
    # we don't even bother with assigning it an operator here.
    attr_accessor :node

    def build
      {:not => @node.build}
    end
  end

  class TermASTNode
    attr_accessor :term

    def initialize(parser, default_field, allowed_fields)
      @parser = parser
      @default_field = default_field
      @allowed_fields = allowed_fields
    end

    def normalize_val(val, range = nil)
      if %w[lt gt gte lte].include?(range)
        {range.to_sym => val}
      else
        val
      end
    end

    # Checks any terms with colons for whether a field is specified, and
    # returns [field, value]. Returns nil when no field prefix can be split
    # off (for example, when every colon is escaped).
    def escape_colons
      @term.match(/\A(.*?[^\\]):(.*)\z/) do |m|
        field, val = m[1, 2]
        field.downcase!

        # Range query
        if field =~ /(.*)\.([gl]te?)\z/
          range_field = $1.to_sym
          return [range_field, normalize_val(val, $2)]
        end

        field = field.to_sym
        unless @allowed_fields.include?(field)
          return [@default_field, normalize_val("#{field}:#{val}")]
        end

        return [field, normalize_val(val)]
      end
    end
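
    # For illustration, assuming @default_field = :tags and
    # @allowed_fields = [:score] (hypothetical values):
    #
    #   "score.gte:100" -> [:score, {:gte => "100"}]   # range query
    #   "score:100"     -> [:score, "100"]             # allowed field
    #   "artist:foo"    -> [:tags, "artist:foo"]       # unknown field folds into the default
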
    def build
      # Quoted terms are taken literally; anything else may contain wildcards.
      wildcardable = !/\A"([^"]|\\")+"\z/.match(term)

      # Strip the surrounding quotes from quoted terms.
      unless wildcardable
        @term = @term.slice(1, @term.size - 2)
      end

      field = nil
      if @term.include?(':')
        field, value = escape_colons
      end

      # No colon, or escape_colons could not split off a field, so fall back
      # to the default field.
      unless field
        field, value = [@default_field, normalize_val(@term)]
      end

      # Range query
      if value.is_a? Hash
        return {range: {field => value}}
      elsif wildcardable and (value =~ /(?:^|[^\\])[\*\?]/)
        # '*' and '?' are wildcard characters in the right context
        value.gsub!(/\\([\*\?])/, '\1')
        @parser.requires_query = true

        if value == '*'
          # All-matching wildcard is just a match_all on the collection
          return {match_all: {}}
        end

        return {wildcard: {field => value}}
      else
        normalize_term! value, (not wildcardable)
        return {term: {field => value}}
      end
    end

    def normalize_term!(match, quoted)
      if quoted
        match.gsub!('\"', '"')
      else
        match.gsub!(/\\(.)/, '\1')
      end
    end
  end
end
class LexerError < StandardError; end
class ParserError < StandardError; end
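
# A rough end-to-end sketch of how the pieces fit together; the field names
# here are hypothetical, and the hash shown is what I'd expect the AST to
# build rather than captured output:
#
#   tokens = Lexer.new.lex('fluffy AND -scary')
#   parser = Parser.new(tokens, :tags, [:tags, :score])
#   parser.parsed
#   # => {:bool => {:must => [{:term => {:tags => "fluffy"}},
#   #                         {:not => {:term => {:tags => "scary"}}}]}}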