I have a morbid fascination with parsers by this point.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
class Lexer
  # Token table: [type, anchored regex, skip-flag]. Rules flagged as
  # skippable (whitespace) are matched and consumed but never emitted.
  # Order matters only to break ties between equally long matches:
  # earlier rules win (so "AND" lexes as :and, not :term).
  TOKEN_LIST = [
    [:space, /\A([\s]+)/, true],
    [:and, /\A(AND|&&|,)/],
    [:not, /\A(NOT|!|-)/],
    [:or, /\A(OR|\|\|)/],
    [:lparen, /\A\(/],
    [:rparen, /\A\)/],
    [:term, /\A([a-zA-Z\d\*\?\:\._]+)/],
  ].freeze

  # Tokenizes +input+ into an array of [type, text, skip] triples,
  # terminated by an [:eof, '$'] marker.
  # Raises LexerError when no rule matches the head of the input.
  def lex(input)
    tokens = []
    until input.empty?
      # Fetch token, push to array, slice input, repeat.
      token = match_token(input)
      tokens << token unless token[2]
      input = input[token[1].length..-1]
    end
    tokens << [:eof, '$']
    tokens
  end

  # Returns the longest-matching token rule for the head of +input+ as
  # [type, matched_text, skip-flag]. Raises LexerError on no match.
  def match_token(input)
    best_match = nil
    TOKEN_LIST.each do |rule|
      result = rule[1].match(input)
      # A nil result is never a better match.
      next unless result
      text = result[0]
      # BUG FIX: the original compared against MatchData#length, which is
      # the capture-group count (not the matched text length), so the
      # longest match was not reliably chosen (e.g. "ANDroid" lexed as
      # :and "AND" + :term "roid" instead of one :term).
      if best_match.nil? || best_match[1].length < text.length
        best_match = [rule[0], text, rule[2]]
      end
    end
    raise LexerError, "Could not match any token against \"#{input}\"." if best_match.nil?
    best_match
  end
end
class Parser
  attr_accessor :parsed, :requires_query

  # Recursive-descent parser over the lexer's token stream. Builds an
  # Elasticsearch-style query hash immediately on construction and exposes
  # it via #parsed.
  #
  # tokens         - Array of [type, text, skip] triples ending in :eof.
  # default_field  - Symbol field used for terms with no recognized field.
  # allowed_fields - Array of Symbols a "field:value" term may target.
  def initialize(tokens, default_field, allowed_fields)
    @tokens = tokens
    @default_field = default_field
    @allowed_fields = allowed_fields
    @parsed = parse.build
  end

  # Entry point: <binary> EOF. Returns the root AST node.
  def parse
    advance
    node = binary
    expect(:eof)
    node
  end

  # <binary> ::= <unary> ((AND | OR) <binary>)?
  def binary
    node = BinaryASTNode.new
    node.left = unary
    if accept(:and) || accept(:or)
      node.operator = @last[0]
      node.right = binary
      return node
    end
    # No operator followed: flatten the tree by dropping the wrapper node.
    node.left
  end

  # <unary> ::= NOT <unary> | <terminal>
  def unary
    if accept(:not)
      node = UnaryASTNode.new
      node.node = unary
      return node
    end
    # Flatten tree.
    terminal
  end

  # <terminal> ::= <term>+ ('(' <term>+ ')')? | '(' <binary> ')'
  def terminal
    if accept(:term)
      # Adjacent terms are joined with spaces into one multi-word term.
      buffer = @last[1]
      buffer += " #{@last[1]}" while accept(:term)
      # Parenthesized tags, e.g. "rose (flower)".
      if accept(:lparen)
        buffer += " ("
        expect(:term)
        buffer += @last[1]
        buffer += " #{@last[1]}" while accept(:term)
        expect(:rparen)
        buffer += ")"
      end
      node = TermASTNode.new(self, @default_field, @allowed_fields)
      node.term = buffer
      node
    elsif accept(:lparen)
      # ( <binary> )
      result = binary
      expect(:rparen)
      result
    else
      # BUG FIX: guard against a nil @current (exhausted token stream) so
      # this raises ParserError rather than NoMethodError.
      raise ParserError, "Unexpected token #{(@current && @current[0]).inspect}!"
    end
  end

  private

  # Shifts the next token into @current, remembering the previous one.
  def advance
    @last = @current
    @current = @tokens.shift
  end

  # Consumes the current token and returns true iff it has the given type.
  def accept(type)
    return false unless @current && @current[0] == type
    advance
    true
  end

  # Like #accept, but a mismatch is a parse error.
  def expect(type)
    accept(type) or raise ParserError, "Expected #{type.inspect}, got #{@current.inspect}"
  end

  # AND/OR node; maps onto an ES bool query (must/should).
  class BinaryASTNode
    BINARY_TO_ES = { :and => :must, :or => :should }.freeze
    attr_accessor :left, :right, :operator

    def build
      { :bool => { BINARY_TO_ES[operator] => [@left.build, @right.build] } }
    end
  end

  # NOT node. Since there is only one possible value a unary tree node
  # could have, we don't even bother with assigning it an operator here.
  class UnaryASTNode
    attr_accessor :node

    def build
      { :not => @node.build }
    end
  end

  # Leaf node: a single (possibly field-qualified, possibly wildcarded,
  # possibly quoted) search term.
  class TermASTNode
    attr_accessor :term

    def initialize(parser, default_field, allowed_fields)
      @parser = parser
      @default_field = default_field
      @allowed_fields = allowed_fields
    end

    # Wraps +val+ into a range-query fragment ({lt:/gt:/gte:/lte: val})
    # when +range+ names a comparison; otherwise returns +val+ unchanged.
    def normalize_val(val, range = nil)
      return { range.to_sym => val } if %w[lt gt gte lte].include?(range)
      val
    end

    # Checks any terms with colons for whether a field is specified.
    # Splits on the first unescaped colon and returns [field, value];
    # returns nil when no unescaped colon is present.
    def escape_colons
      @term.match(/\A(.*?[^\\]):(.*)\z/) do |m|
        field, val = m[1, 2]
        field.downcase!
        # Range query, e.g. "score.gte:100".
        if field =~ /(.*)\.([gl]te?)\z/
          return [$1.to_sym, normalize_val(val, $2)]
        end
        field = field.to_sym
        # Unknown fields fold the whole "field:value" text back into the
        # default field.
        unless @allowed_fields.include?(field)
          return [@default_field, normalize_val("#{field}:#{val}")]
        end
        return [field, normalize_val(val)]
      end
    end

    # Builds the ES query fragment for this term: a range, match_all,
    # wildcard, or term query depending on the term's contents.
    def build
      # Fully quoted terms are literal: strip the quotes and disallow
      # wildcard interpretation.
      wildcardable = !/\A"([^"]|\\")+"\z/.match(term)
      @term = @term.slice(1, @term.size - 2) unless wildcardable

      field = nil
      field, value = escape_colons if @term.include?(':')
      # No colon, or escape_colons encountered an unescaped colon.
      field, value = [@default_field, normalize_val(@term)] unless field

      if value.is_a? Hash
        # Range query.
        { range: { field => value } }
      elsif wildcardable && (value =~ /(?:^|[^\\])[\*\?]/)
        # '*' and '?' are wildcard characters in the right context.
        # BUG FIX: the original unescape regex was /\\([\A\*\?])/ — the \A
        # anchor is meaningless inside a character class (it matches a
        # literal 'A'), so "\A" sequences were unescaped by mistake.
        value.gsub!(/\\([*?])/, '\1')
        @parser.requires_query = true
        # All-matching wildcard is just a match_all on the collection.
        return { match_all: {} } if value == '*'
        { wildcard: { field => value } }
      else
        normalize_term!(value, !wildcardable)
        { term: { field => value } }
      end
    end

    # Resolves backslash escapes: quoted terms only unescape '\"', while
    # unquoted terms unescape every backslashed character. Mutates +match+.
    def normalize_term!(match, quoted)
      if quoted
        match.gsub!('\"', '"')
      else
        match.gsub!(/\\(.)/, '\1')
      end
    end
  end
end
# Raised by Lexer#match_token when no token rule matches the input head.
class LexerError < StandardError; end

# Raised by Parser when the token stream violates the grammar.
class ParserError < StandardError; end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.