Last active
January 5, 2016 00:35
-
-
Save merlight/5fc60cc56cdc87fa1c73 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- debug utils | |
local df = df or function(...) print(string.format(...)) end | |
local function dstring(arg) | |
if type(arg) == "string" then | |
return string.format("%q", arg) | |
else | |
return tostring(arg) | |
end | |
end | |
---------------------------- | |
-- BEGIN GENERIC PARSER LIB | |
local strfind = string.find | |
local strformat = string.format | |
local strsub = string.sub | |
local strgsub = string.gsub | |
local pop = table.remove | |
local push = table.insert | |
local operator_function = | |
{ | |
__call = function(self, ...) | |
local op, lhs, rhs = self[1], self[2], self[3] | |
return op(lhs, rhs, ...) | |
end | |
} | |
local function flush_ast(have_args, arg_stack, op_stack, op_next) | |
for i = #op_stack, 1, -1 do | |
local op_last = op_stack[i] | |
if type(op_last) == "number" then -- reached '(' | |
break | |
end | |
if op_next then | |
local prec = op_last.precedence - op_next.precedence | |
if prec >= 0 and (prec > 0 or op_next.assoc == 'right') then | |
break | |
end | |
end | |
df("# flush operator '%s'", op_last.name) | |
if op_last.unary_func then | |
if have_args < 1 then | |
error(strformat("missing argument for operator '%s'", op_last.name)) | |
end | |
local n = #arg_stack | |
local op = {op_last.unary_func, nil, arg_stack[n]} | |
arg_stack[n] = setmetatable(op, operator_function) | |
elseif op_last.binary_func then | |
if have_args < 2 then | |
error(strformat("missing argument for operator '%s'", op_last.name)) | |
end | |
local n = #arg_stack | |
local op = {op_last.binary_func, arg_stack[n-1], arg_stack[n]} | |
arg_stack[n-1] = setmetatable(op, operator_function) | |
arg_stack[n] = nil | |
have_args = have_args - 1 | |
else | |
error(strformat("what is operator '%s' supposed to do?", op_last.name)) | |
end | |
op_stack[i] = nil | |
end | |
return have_args | |
end | |
function build_ast(tokenizer, str, pos) | |
local arg_stack = {} | |
local op_stack = {} | |
local have_args = 0 | |
while pos do | |
local npos, op_next, token = tokenizer(str, pos) | |
if not npos then | |
error(strformat("malformed expression near '%s'", strsub(str, pos + 1))) | |
else | |
pos = npos | |
npos = pos < 0 and -pos or pos -- need actual position for error messages | |
end | |
if type(op_next) == "table" then | |
df("- pos=%3d op=%s token=%s", pos, dstring(op_next.name), dstring(token)) | |
else | |
df("- pos=%3d op=%s token=%s", pos, dstring(op_next), dstring(token)) | |
end | |
if op_next then | |
if op_next == '(' then | |
push(op_stack, have_args) | |
have_args = 0 | |
elseif op_next == ')' then | |
have_args = flush_ast(have_args, arg_stack, op_stack) | |
local had_args = pop(op_stack) | |
if type(had_args) ~= "number" then | |
error(strformat("unmatched closing parenthesis at '%s'", strsub(str, npos))) | |
end | |
assert(have_args <= 1) | |
have_args = have_args + had_args | |
elseif op_next == '$' then -- end of expression | |
have_args = flush_ast(have_args, arg_stack, op_stack) | |
break | |
else | |
have_args = flush_ast(have_args, arg_stack, op_stack, op_next) | |
push(op_stack, op_next) | |
end | |
end | |
if token then | |
push(arg_stack, token) | |
have_args = have_args + 1 | |
end | |
end | |
if #op_stack > 0 then | |
error(strformat("unmatched opening parenthesis in '%s'", str)) | |
elseif #arg_stack > 1 then | |
error(strformat("malformed expression '%s'", str)) | |
end | |
return unpack(arg_stack) | |
end | |
-- END OF GENERIC PARSER LIB | |
----------------------------- | |
--------------------------------- | |
-- THIS PART IS USING THE PARSER | |
-- helper function to evaluate an AST node: | |
-- if it's a string, it's a pattern to be matched, | |
-- otherwise it's a compound node to be called | |
local function match_end(node, str, pos) | |
if type(node) == "string" then | |
local ms, me = strfind(str, node, pos + 1) | |
return me | |
else | |
return node(str, pos) | |
end | |
end | |
-- define operator functions | |
-- they all receive the left- and right-hand-side AST nodes as their | |
-- first two arguments (unary operators receive lhs==nil) | |
-- further arguments are arbitrary context passed from user code | |
local function match_negation(lhs, rhs, str, pos) | |
local rpos = match_end(rhs, str, 0) | |
if rpos then | |
return false | |
else | |
return pos | |
end | |
end | |
function match_alternation(lhs, rhs, str, pos) | |
local lpos = match_end(lhs, str, pos) | |
local rpos = match_end(rhs, str, pos) | |
if lpos and rpos then | |
return lpos < rpos and lpos or rpos | |
else | |
return lpos or rpos | |
end | |
end | |
function match_concatenation(lhs, rhs, str, pos) | |
local lpos = match_end(lhs, str, pos) | |
if lpos then | |
return match_end(rhs, str, lpos) | |
else | |
return false | |
end | |
end | |
local operators = | |
{ | |
['!'] = {name='!', assoc='right', precedence=200, unary_func=match_negation}, | |
[' '] = {name='.', assoc='left', precedence=300, binary_func=match_concatenation}, | |
['+'] = {name='+', assoc='left', precedence=500, binary_func=match_alternation}, | |
} | |
-- define tokenizer; | |
-- it works like an iterator function, except that a nil return means syntax error; | |
-- for well-formed expression, it returns '$' marker at the end | |
function nexttoken(str, pos) | |
local op_concat = nil | |
if pos < 0 then -- the previous token was a string or ')' | |
pos = -pos | |
-- if the next token is a string, prepend it | |
-- with a synthesized concatenation operator | |
op_concat = operators[' '] | |
-- if the next token is '(' or a unary operator, | |
-- synthesize a concatenation operator first | |
if strfind(str, '^%s*[(!]', pos + 1) then | |
return pos, op_concat, nil | |
end | |
end | |
-- match everything up to operator | |
local ms, me, m1 = strfind(str, '^%s*([^"()+!^~]+)', pos + 1) | |
if m1 then | |
-- escape metacharacters | |
local pat = strgsub(m1, "[-*+?^$().[%]%%]", "%%%0") | |
-- replace whitespace with "match shortest anything" | |
pat = strgsub(pat, "%s+", ".-") | |
return -me, op_concat, pat | |
end | |
-- match quoted string | |
local ms, me, m1 = strfind(str, '^%s*"(.-)"', pos + 1) | |
if m1 then | |
-- escape metacharacters | |
local pat = strgsub(m1, "[-*+?^$().[%]%%]", "%%%0") | |
return -me, op_concat, pat | |
end | |
-- match operator or empty string at end of expression | |
local ms, me, m1 = strfind(str, '^%s*(.?)', pos + 1) | |
if m1 == ')' then | |
-- return negative position to tell the next round | |
-- there was a value preceding it | |
return -me, m1, nil | |
elseif m1 == '(' then | |
return me, m1, nil | |
elseif m1 == '' then | |
return me, '$', nil -- end of expression marker | |
else | |
return me, assert(operators[m1]), nil | |
end | |
end | |
function eval_ast(node, str) | |
return match_end(node, str, 0) | |
end | |
-- and this is how you put it together | |
local some_strings = {'book chapter one', 'one book chapter', 'book chapter two', 'book chapter three', 'book chapter twenty one'} | |
local filter_expr = 'chapter (one + two) !twenty' | |
df("\nPARSE TEST : %s\n", filter_expr) | |
local ok, ast = pcall(build_ast, nexttoken, filter_expr, 0) | |
if not ok then | |
error(ast) | |
end | |
df("\nMATCH TEST\n") | |
for i, str in ipairs(some_strings) do | |
local ok, match = pcall(match_end, ast, str, 0) | |
if not ok then | |
df("! error : %q : %q", match, str) | |
elseif match then | |
df("+ match : %q", str) | |
else | |
df("- no match : %q", str) | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment