Skip to content

Instantly share code, notes, and snippets.

@fowlmouth
Last active August 29, 2015 14:00
Show Gist options
  • Save fowlmouth/11284189 to your computer and use it in GitHub Desktop.
Save fowlmouth/11284189 to your computer and use it in GitHub Desktop.
Some kind of crazy parsing thing
import fowltek/maybe_t, strutils
type
TMatcher* = proc(input:string;start:var int): TMaybe[string]
Rule* = ref object
matcher*: TMatcher
proc newRule* (matcher: TMatcher): Rule =
Rule(matcher: matcher)
proc `&`* (a,b: Rule): Rule =
# match `a` followed by `b`
newRule(proc(input:string;start:var int): TMaybe[string] =
let ZZ = start
let x = a.matcher(input,start)
if x.has:
let y = b.matcher(input,start)
if y.has:
return just(x.val & y.val)
start = ZZ
)
proc `|`* (a,b: Rule): Rule =
# match `a` or `b`
newRule(proc(input:string; start:var int): TMaybe[string] =
let ZZ = start
let x = a.matcher(input,start)
if x.has:
return just(x.val)
start = ZZ
let y = b.matcher(input,start)
if y.has:
return just(y.val)
start = ZZ
)
proc str* (s:string): Rule =
# match a string
newRule(proc(input:string; start:var int): TMaybe[string] =
if input.continuesWith(s, start):
result = just(s)
inc start, len(s)
)
proc chr* (chrs: varargs[char]): Rule =
# match one of a set of characters
let chrs = @chrs
newRule(proc(input:string;start:var int): TMaybe[string] =
if input[start] in chrs:
result = just($ input[start])
inc start
)
proc chr* (chr: set[char]): Rule =
# match one of a set of characters
newRule(proc(input: string; start: var int): TMaybe[string] =
if input[start] in chr:
result = just($ input[start])
inc start
)
proc repeat* (R:Rule; min: int): Rule =
# Repeat a minimum `min` times
newRule(proc(input:string; start:var int): TMaybe[string] =
var i = 0
let ZZ = start
result.val = ""
while true:
let ret = r.matcher(input,start)
if not ret.has: break
result.val.add ret.val
inc i
if start > input.high: break
if i >= min:
result.has = true
else:
start = ZZ
)
template reset_return : stmt =
result.has = false
result.val = ""
return
proc repeat* (R:Rule; slc: TSlice[int]): Rule =
# Repeat a rule min..max times
newRule(proc (input:string; start:var int): TMaybe[string] =
let ZZ = start
var i = 0
result.val = ""
while i <= slc.b:
if start > input.high: break
if (let ret = r.matcher(input,start); ret.has):
result.val.add ret.val
else: break
inc i
if i >= slc.a:
result.has = true
else:
start = ZZ
)
proc `+`* (rule: Rule): Rule =
# match 1 or more
result = rule.repeat(1)
discard """ newRule(proc (input:string; start:var int): TMaybe[string] =
result.val = ""
while true:
let (has,str) = rule.matcher(input,start)
if has:
result.has = true
result.val.add str
if not has or start > input.high:
break
) """
proc many* (R:Rule): Rule {.inline.} =
# match 1 or more
+r
proc `?`* (R:Rule): Rule =
# consume 1 or 0 (always pass)
newRule(proc(input:string; start:var int):TMaybe[string]=
result = just(r.matcher(input,start).val)
if result.val.isNil: result.val = ""
)
proc maybe* (R:Rule): Rule {.inline.}=
# consume 1 or 0 (always pass)
?r
proc `*`* (R:Rule): Rule {.inline.} =
# consume 0 or many (always pass)
# this is just `?many(R)`
#?many(R)
repeat(R, 0)
proc present* (R:Rule): Rule =
# look ahead, returns true if the pattern matches but does
# not consume
newRule(proc(input:string; start:var int): TMaybe[string] =
let x = start
if r.matcher(input,start).has:
result = just("")
start = x
)
proc absent* (R:Rule): Rule =
# look ahead only, returns true if the pattern was NOT found
newRule(proc(input:string; start:var int): TMaybe[string] =
if not present(R).matcher(input,start).has:
result = just("")
)
proc `!`* (R: Rule): Rule {.inline.} =
absent(R)
proc tag (R:Rule; tag:string):Rule =
# TODO
# this will be used to store a match in the resulting tree
# the tree will probably be json because its easy to work with
Rule(
matcher:proc(input:string;start:var int):TMaybe[string] =
result = R.matcher(input,start)
if result.has:
#echo "tag discarded: ", tag, ": ", result.val
)
when defined(HelperRules) or isMainModule:
# Some helper rules
const
printableChars = {'\x20' .. '\x7E'}
alphaChars = {'A'..'Z', 'a'..'z'}
digits = {'0'..'9'}
identChars = {'A'..'Z','a'..'z','0'..'9','_'}
allChars = {'\x00' .. '\xFF'}
proc join* (r, on: Rule; min = 0; max = -1): Rule =
# Join a rule on another rule in the sequence
# `on & r` must repeat `min` times
# `max` may be -1
if max > -1: r & (on & r).repeat(min .. max)
else: r & (on & r).repeat(min)
proc comma_list* (r: Rule; min = 0; max = -1): Rule =
join(r, chr(','), min,max)
proc wrapped_rule (open,close:Rule): proc(inner:Rule): Rule =
# Creates a rule that matches like `open & inner & close`
return proc(inner:Rule): Rule =
when true:
let
open = open
close = close
Rule(
matcher: proc(input:string; start:var int): TMaybe[string] =
let x = start
if open.matcher(input,start):
result = inner.matcher(input,start)
if result.has:
if close.matcher(input,start):
return
#reset the result because the closing rule didnt match
result.has = false
start = x
)
let
braces* = wrapped_rule(chr('{'), chr('}'))
parens* = wrapped_rule(chr('('), chr(')'))
brackets* = wrapped_rule(chr('['), chr(']'))
proc add (L:var string; R:TMaybe[string]) =
if r.has: L.add r.val
discard """ proc comma_list* (R:Rule; postComma: Rule = nil): Rule =
newRule(proc(input:string; start:var int):TMaybe[string] =
result.val = ""
while(let (has,s) = r.matcher(input,start); has):
result.has = true
result.val.add s
if input[start] != ',':
break
result.val.add ','
inc start
if not PostComma.isNil:
result.val.add postComma.matcher(input,start)
)
"""
proc keyword* (s: string): Rule =
str(s) & absent(chr(identChars))
proc stri* (s: string): Rule =
# case insensitive str
# probably more efficient to use a regex rule here
template accum (x): stmt =
if result.isNil:
result = x
else:
result = result & x
for character in s.items:
if character in alphaChars:
accum chr({character.toLower, character.toUpper})
else:
accum chr(character)
proc newRule*(): Rule = Rule()
proc `:=`* (a,b:Rule) =
# update rule a, set its matcher to rule b
# you can use this to refer to rules before
# they're initialized.
a.matcher = b.matcher
import macros
macro grammar* (name:expr; body:stmt):stmt {.immediate.} =
# accepts a list of statements like
#
# hello := "Hello"
# digits <- many(chr( {'0' .. '9'} ))
#
# you can refer to a rule here before it is defined
#
assert body.kind == nnkStmtList
result = newStmtList()
let varDecl = newNimNode(nnkVarSection)
result.add varDecl
for i in 0 .. < len(body):
let s = body[i]
if s.kind == nnkInfix and $(s[0]) in [":=","<-"]:
varDecl.add newIdentDefs(s[1], newEmptyNode(), newCall("newRule"))
result.add s[1].infix(":=", s[2])
elif s.kind == nnkAsgn:
varDecl.add newIdentDefs(s[0], newEmptyNode(), newCall("newRule"))
result.add s[0].infix(":=", s[1])
else:
result.add s
when defined(Debug):
echo repr(result)
proc match* (R:Rule; S:String): TMaybe[string] =
var i = 0
return r.matcher(s,i)
when isMainModule:
const
spaceChars = {' ','\t'}
grammar(my_grammar):
#so meta
program =
+(space_newlines | rule_decl) & ?space_newlines
rule_decl =
name & ?space &
(str("=") | str(":=") | str("<-")) &
?space & mainRule
nl = chr({'\L'})
space =
+chr(spaceChars)
space_newlines =
+(
space | nl |
(chr('#') & +(absent(chr({'\x00','\L'})) & chr(allChars)))
)
name = chr({'A'..'Z','a'..'z','_'}) & *chr({'A'..'Z','a'..'z','_','0'..'9'})
mainRule = join(prefixRule, ?space & bin_op & ?space_newlines, 0)
prefixChar = chr({'?','!','+','*'})
bin_op = chr({'&','|'})
prefixRule = ?(prefixChar & *(space & prefixChar)) & ?space & baseRule
func_chr =
keyword("chr") &
parens(
(?space & nim_char_set & ?space) |
comma_list(?space & nim_char_lit & ?space)
)
baseRule =
( keyword("str") & parens(?space & str_lit & ?space) ) |
func_chr |
( (keyword("comma") )) |
( parens(mainRule) )
str_lit =
chr('\"') &
(repeat(
( chr('\\')& (
chr('"') |
(chr('x') & repeat(chr({'A'..'F','a'..'f','0'..'9'}), 2..2))
)) |
chr(allChars-{'"'}), 0
)) &
chr('\"')
nim_char_lit =
chr('\'') &
(
(chr('\\') &
( chr({'\'','L'}) |
chr('x') & repeat(chr({'A'..'F','a'..'f','0'..'9'}), 2..2)
)
) |
chr(allChars-{'\''})
) &
chr('\'')
nim_char_set =
braces(comma_list(?space & nim_char_lit & ?(?space & str("..") & ?space & nim_char_lit), 0))
comma = chr(',')
let r = program #(?space & mainrule & ?space)
echo r.match("""rule1 = ?chr('x')""")
when defined(repl):
import rdstdin,tables
block:
const helptext = """
Commands:
def-rules
show-rules
"""
echo helptext
var rules = initTable[string,Rule]()
var input = ""
var line: string
while readlineFromStdin("> ", line):
if (line.len == 0 and input.len > 0) or keyword("end").match(line):
let m = program.match(input)
echo m
input.setLen 0
else:
input.add line
input.add '\L'
#quit 1
# minitest thing
template test (name:expr; body:stmt): stmt {.immediate.} =
block thisBlock:
template check (cond): stmt =
if not cond:
echo name, " [Failed] `", astToStr(cond), '`'
break thisBlock
body
echo name, " [Passed]"
template testAll (name:expr; body:stmt): stmt{.immediate.}=
# like test but checks can fail
block fowl_is_cool:
template check(cond): stmt =
when not definedInScope(failed):
var failed{.inject.} = newSeq[string](0)
if not cond:
failed.add astToStr(cond)
body
if failed.len > 0:
if failed.len == 1:
echo name, " [", failed.len, " Failed] `", failed[0], '`'
else:
echo name, " [", failed.len, " Failed]"
for expression in failed:
echo " `", expression, '`'
failed.setLen 0
else:
echo name, " [Passed]"
block:
const digit = {'0'..'9'}
grammar(sendy):
ws_soft = +chr(' ','\t')
ws = +chr(' ','\t','\L')
anyChar = chr({'\x00'..'\xFF'})
operator = +chr({'!','@','~','$','%','^','&','*','-','+','\\','/','<','>','?','.'})
assignment_op = +operator & chr('=')
literal = str_lit | float_lit | int_lit
float_lit = ?chr({'-','+'}) & +chr(digit) & chr('.') & +chr(digit)
int_lit = ?chr({'-','+'}) & +chr(digit)
str_lit =
chr('"') &
*('"'.chr.absent & anyChar) &
chr('"')
ident = chr({'A'..'Z','a'..'z','_'}) & *chr({'A'..'Z','a'..'z','_','-','0'..'9'})
message =
ident &
?parens(comma_list(?ws & expression & ?ws, 0) | ?ws)
base_expr = literal | parens(?ws & expression & ?ws)
postfix_expr =
(base_expr & *(ws_soft & message)) |
message.join(ws_soft)
prefix_expr = ?(operator & ?ws_soft) & postfix_expr
binary_expr = prefix_expr & ?many(?ws_soft & operator & ?ws & prefix_expr)
expression = binary_expr
testall "Idents":
check ident.match("x")
check ident.match("print-line")
testall "expressions":
check expression.match "1 + 2"
check expression.match "-1 abs"
check expression.match("x foo(42)")
check expression.match("""stdout print-line("Hello")""")
check expression.match("actor message(foo, 1) linked-message(1.2)")
quit 0
# example usage for a weird js-like language
# there is a test suite below
block:
grammar(js_like):
ws <-
many(chr({' ', '\t', '\L'}))
ws_soft <-
many(chr({' ', '\t'}))
ident <-
chr({'A'..'Z', 'a'..'z', '_'}) & *chr(identChars)
int_lit <-
+chr(digits)
str_lit <-
chr('"') &
*(absent(chr('"')) & chr({char.low..char.high})) &
chr('"')
operator <- +chr({'!','@','~','$','%','^','&','*','-','+','\\','/','<','>','?','='})
base_expr <-
int_lit | ident | str_lit | parens(?ws & expression & ?ws)
postfix_expr <-
base_expr &
?many(
(
chr('.') &
ident
) |
(
parens(?ws & ?(commaList(?ws & expression & ?ws_soft,0) & ?ws))
) |
(
brackets(?ws & commaList(?ws & expression & ?ws_soft, 0).maybe)
)
)
prefix_expr <-
?(operator & ?ws) & postfix_expr
binary_expr <-
prefix_expr & ?many(?ws_soft & operator & ?ws & prefix_expr)
expression <-
binary_expr
braces_statements <-
braces(?ws & statement_list & ?ws)
statements_or_single <-
(?ws & braces_statements) |
(?ws & statement)
assignment <-
ident & ?ws_soft &
chr('=') & ?ws &
expression
if_statement <-
(
keyword("if") & ?ws & expression &
statements_or_single
) &
*(
?ws & keyword("elseif") & ws & expression &
statements_or_single
) &
?(
?ws & keyword("else") &
statements_or_single
)
while_statement <-
keyword("while") & ws & expression & statements_or_single
return_statement <-
keyword("return") & ws & expression
statement <-
return_statement |
if_statement |
while_statement |
assignment |
expression
statement_list <-
join(statement, ?ws_soft & chr({'\L',';'}) & ?ws)
func_def <-
keyword("fn") & ws &
ident.tag("name") & ?ws &
parens(comma_list(?ws & ident & ?ws) | ?ws) & ?ws &
braces(?ws & statement_list & ?ws)
toplevel_stmt <-
func_def
program <-
?ws & join(toplevel_stmt, ?ws, 0) & ?ws
proc `=~` (S:String; L:Rule): bool =
var i = 0
result = l.matcher(s, i)
if result and i < s.len:
result = false
testAll "stri()":
check stri("FoO").match("foo")
check stri("FoO").match("FOO")
testAll "Numbers":
check match(int_lit, "1")
check match(int_lit, "6969")
check match(expression, "911")
test "Strings":
check "\"\"" =~ str_lit
check "\"hi\"" =~ str_lit
check(not("\"hi" =~ str_lit))
test "Identifiers":
check match(ident, "u")
check match(ident, "x_y")
check(not match(ident, "3e"))
testAll "Whitespace":
check match(?ws, " ")
check match(?ws, "")
check match(?ws_soft, " ")
check match(?ws_soft, "")
check(not match(ws_soft, "\L"))
testAll "Assignment":
check "y=2" =~ assignment
check "x =\L 4" =~ assignment
testAll "Expressions":
check "42" =~ expression
check "x + 1" =~ expression
check "1 + (2-3)" =~ expression
check "true" =~ expression
check "hello(1)" =~ expression
check "sup()" =~ expression
testAll "Statements":
check "if x {\L 42\L }" =~ statement
check "if true {1} elseif false {0}" =~ statement
check "if true {1} elseif false {0} else {3}" =~ statement
check "42+1" =~ statements_or_single
check "while false {print(\"Hello!\"); break}" =~ statement
testAll "Function defintion":
check("fn no_args() { return 1 }\L" =~ func_def & ?ws)
check("fn two_args(a,b) {1+2}" =~ func_def)
testAll "Program":
check("""fn main() {
print("Hello")
}
fn fib(x) {
if x < 2 { return x }
return fib(x-1)+fib(x-2)
}
""" =~ program)
import
json,strutils,
fowltek.maybe_t
type
TInput = object
len, pos: int
str: string
TMatchKind* = enum
mUnrefined, mJson
TPositiveMatch = object
case kind*: TMatchKind
of mUnrefined: str*:string
of mJSON: j*: PJsonNode
TMatchResult = TMaybe[TPositiveMatch]
TMatcher* = proc(input: var TInput): TMatchResult
Rule* = ref object
matcher: TMatcher
proc newRule (m:TMatcher): Rule =
Rule(matcher:m)
proc `$`* (R: TPositiveMatch): string =
case r.kind
of mUnrefined:
result = r.str
of mJson:
result = $r.j
proc match* (R: Rule; str: string): TMatchResult =
var input = TInput(len: str.len, str: str, pos: 0)
result = r.matcher(input)
template matchf(body:stmt):expr {.immediate.} =
(proc(input: var TInput): TMatchResult =
body)
proc mk_unref (s:string): TMatchResult =
just(TPositiveMatch(kind: mUnrefined, str: s))
proc mk_j (j:PJsonNode): TMatchResult =
just(TPositiveMatch(kind: mJson, j: j))
proc isArray (r:TMatchResult): bool =
r.has and r.val.kind == mJson and r.val.j.kind == jArray
proc high* (i: TInput): int = i.str.high
let matchFail = Nothing[TPositiveMatch]()
template any* (iter, name, cond: expr): expr {.immediate.}=
var res{.gensym.} = false
for name in iter:
if cond:
res = true
break
res
proc merge* (J1, J2: PJsonNode) =
if j1.kind == jObject and j2.kind == jObject:
for key,val in items(j2.fields):
j1[key] = val
template testFeature (name;body:stmt):stmt{.immediate.}=
block:
when not defined(failed_tests):
var failed_tests{.inject.}: seq[string] = @[]
template check (xpr): stmt =
discard """ when not defined(failed_tests):
var failed_tests{.inject.}: seq[string] = @[] """
if not xpr:
failed_tests.add astToStr(xpr)
body
if failed_tests.len > 0:
echo name, " [", failed_tests.len, " Failures]"
for f in failed_tests:
echo " ", f
failed_tests.setLen 0
else:
echo name, " [Passed]"
proc accumJson (results:varargs[TPositiveMatch]): TMatchResult =
# discard any unrefineds
var res = newSeq[TPositiveMatch](results.len)
res.setLen 0
for it in results:
if it.kind == mUnrefined: continue
res.add it
assert res.len > 0
if res.len == 1:
result = just(res[0])
return
# try to merge json objects
var i = 0
when defined(Debug):
echo "res.len: ", res.len
var iters = 0
while i < high(res):
echo iters, " (", i,")"
if iters > 5: break
inc iters
if res[i].j.kind == jObject and res[i+1].j.kind == jObject:
type TKV = tuple[key:string,val:PJsonNode]
proc keys (J: PJsonNode): seq[string] =
j.fields.map(proc(kv:TKV):string = kv.key)
let
r1 = res[i].j
r2 = res[i+1].j
r2_keys = r2.keys
when defined(Debug):
echo "Comparing $# and $#".format(r1,r2)
proc filter (item:string): bool =
result = item in r2_keys
if result and defined(Debug):
echo "Clash: ", item
if any(r1.keys, it, filter(it)):
when defined(debug):
echo "Cannot join them."
inc i, 1
continue
# merge and delete r2
r1.merge r2
res.delete i+1
inc i, 1
else:
inc i
if res.len == 1:
return just(res[0])
else:
# multiple results, return it as a jarray
var arr = newJarray()
for it in res:
arr.add it.j
result = mk_j(arr)
proc currentChar* (input:TInput): char = input.str[input.pos]
proc chrMatcher (chars: seq[char]|set[char]): TMatcher =
return (matchf do:
if input.currentChar in chars:
result = mk_unref($ input.currentChar)
input.pos.inc
)
proc chr* (chars: varargs[char]): Rule =
newRule(chrMatcher(@chars) )
proc chr* (chars: set[char]): Rule =
newRule(chrMatcher(chars))
proc tag* (R:Rule; name:string): Rule =
newRule(
(matchf do:
result = r.matcher(input)
if result.has:
if result.val.kind == mUnrefined:
result = mk_j(%{ name: %result.val.str })
else:
result = mk_j(%{ name: result.val.j })
))
testFeature "tag()":
check((let r = chr('A').tag("x").match("A"); r.has and r.val.kind == mjson and $r.val.j == "{\"x\": \"A\"}"))
proc accumulate (results: varargs[TPositiveMatch]): TMatchResult =
if results.len == 1: return just(results[0])
if results.any(it, it.kind == mJson):
result = results.accumJson
else:
# all unrefineds. join them.
result = just(TPositiveMatch(kind: mUnrefined, str: ""))
for it in results:
result.val.str.add it.str
proc `&` * (A,B: Rule): Rule =
newRule(matchf do:
let zz = input.pos
let ma = a.matcher(input)
if ma.has:
let mb = b.matcher(input)
if mb.has:
result = accumulate(ma.val, mb.val)
return
input.pos = zz
)
testFeature "& sequence":
let rule = chr('A') & chr('x')
check rule.match("Ax")
check(not rule.match("xlkj"))
let match_rule = rule.tag("match")
check match_rule.match("Ax").val.j["match"].str == "Ax"
let test5 = (chr('a') & chr('b') & chr('c') & chr('d')).tag("match")
check test5.match("abcd").val.j["match"].str == "abcd"
proc `|`* (A,B: Rule): Rule =
newRule(matchf do:
let zz = input.pos
if (let (has,m) = a.matcher(input); has):
return just(m)
input.pos = zz
if (let (has,m) = b.matcher(input); has):
return just(m)
input.pos = zz
)
testFeature "OR sequence":
let rule = chr('a') | chr('b')
check rule.match("a")
check rule.match("b")
check(not rule.match("c"))
proc repeat* (R:Rule; min,max:int): Rule =
newRule(matchf do:
var i = 0
let zz = input.pos
var results: seq[TPositiveMatch] = @[]
while i < max and input.pos < input.len:
if (let (has,res) = r.matcher(input); has):
results.add res
inc i
else:
break
if i < min:
input.pos = zz
result = match_fail
else:
if i > 0:
result = accumulate(results)
else:
result = mk_unref("")
)
proc repeat* (R:Rule; min:int): Rule =
newRule(matchf do:
var i = 0
let zz = input.pos
var results: seq[TPositiveMatch] = @[]
while input.pos < input.len:
if (let (has,res) = r.matcher(input); has):
results.add res
inc i
else:
break
if i < min:
input.pos = zz
result = match_fail
else:
if i > 0:
result = accumulate(results)
else:
result = mk_unref("")
)
testFeature "Repeat":
#echo($chr('x').repeat(1).match("xx"))
#check(not chr('x').repeat(1).match("x").has)
check($chr('x').repeat(1).match("xx") == "xx")
# repetitions are tagged together
let rule = chr('x').repeat(1).tag("x")
echo rule.match("xx")
check($ chr('x').repeat(1).tag("x").match("xx").val.j == """{"x": "xx"}""")
# repetitions are merged as an array
block:
#echo chr('x').tag("x").match("xx").val.j
check($chr('x').tag("x").repeat(1).match("xx").val.j == """[{"x": "x"}, {"x": "x"}]""")
proc `+`* (R:Rule): Rule =
repeat(R, 1)
proc `*`* (R:Rule): Rule =
repeat(R, 0)
proc `?`* (R:Rule): Rule =
repeat(R, 0,1)
block:
const digits = {'0'..'9'}
let
int_lit = chr(digits).repeat(1).tag("x")
space = +chr(' ','\t')
expression = int_lit & *(space & int_lit)
block:
let x = expression.match("1 2")
assert x.has and $x.val.j == """[{"x": "1"}, {"x": "2"}]"""
#echo expression.match("1 2 3")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment