Skip to content

Instantly share code, notes, and snippets.

@pgoodman
Created December 31, 2008 18:46
Show Gist options
  • Save pgoodman/42080 to your computer and use it in GitHub Desktop.
Save pgoodman/42080 to your computer and use it in GitHub Desktop.
;;;; Recursive Descent Parser Generator
;;;; Copyright 2008 Peter Goodman, all rights reserved
(defun concatenate* (type &rest lst)
"Concatenate all cars within a tree."
(setf lst (remove nil lst))
(if (null lst)
"" ; nothing to concatenate, return
(let ((a (car lst)) (d (cdr lst)))
(if (listp a)
(setf a (concatenate* type (car a) (cdr a))))
(if (not (typep a type))
(error "Invalid type passed to concatenate*."))
(concatenate type a (concatenate* type d)))))
(defmacro as-list (lst)
"Return a list; if the argument isn't a list then box it in a list."
`(if (listp ,lst)
,lst
(list ,lst)))
(defmacro push-to-end (obj lst)
"Push obj onto the end of lst."
`(setf ,lst (append ,lst (as-list ,obj))))
(defun map-dfs (tree &key (list-fn #'identity) (elm-fn #'identity))
"Go as deep into the tree and apply the function to any lists, then gradually
bubble up to the top by applying the function to lists."
(declare (type list tree)
(type function list-fn elm-fn))
(if (null tree)
(return-from map-dfs nil))
(funcall list-fn (mapcar #'(lambda (elm)
(if (listp elm)
(map-dfs elm :list-fn list-fn :elm-fn elm-fn)
(funcall elm-fn elm)))
tree)))
(defparameter *parser-ops* (make-hash-table :size 14)
"A hashtable of program defined parser operators.")
(defparameter *parser-fncs* (make-hash-table)
"A hashtable of user-defined parser functions.")
(defparameter *intermediate-parser-fncs* (make-hash-table :size 20)
"Intermediate parser functions.")
(defstruct parser-state
"The current state that the parser is in."
(buffer "" :type string)
(index 0 :type fixnum)
(matches nil :type list))
(defmacro return-values-from (block-id &key (sub-matches nil) (matched t) (consumed t))
"Abbreviation of 'return-from' and 'values'."
`(return-from ,block-id (values ,sub-matches ,matched , consumed)))
(defmacro return-failed-match (block-id)
"Abbreviation for a failed match."
`(return-from ,block-id (values (list nil) nil nil)))
(defmacro $state-buffer ()
`(parser-state-buffer $state))
(defmacro $state-index ()
`(parser-state-index $state))
(defmacro $state-matches ()
`(parser-state-matches $state))
(defmacro parser-error (type str)
(let ((str-type (case type
(parse "Parse Error")
(expr "Expression Error")
(otherwise "Unknown Error"))))
`(error (format nil "~A: ~A" ,str-type ,str))))
(defun parser-make-string-matcher (str)
"Make the part of a function to match the front of the text in the buffer
against a string."
(declare (type string str))
(let* ((str-len (length str))
(block-id (gensym))
(start-index (gensym))
(bool-consumed (> str-len 0)))
`(block ,block-id
(let ((,start-index ($state-index)))
(if (<= ,str-len $len)
(if (string= ,str ($state-buffer)
:start1 0
:start2 ,start-index
:end1 ,str-len
:end2 (+ ,str-len ($state-index)))
(progn
(incf ($state-index) ,str-len)
(return-values-from ,block-id
:sub-matches (subseq ($state-buffer) ,start-index (+ ,start-index ,str-len))
:matched t
:consumed ,bool-consumed)))))
(return-failed-match ,block-id))))
(defun parser-make-find-next (ops)
"Find everything up to the next instance of a string, or repeatedly test the parser
state buffer against a certain sub-parser function."
(declare (type list ops))
(if (or (= 0 (length ops)) (> (length ops) 1))
(parser-error expr "find-next expects one and only one operand."))
(let ((op (car ops)))
(if (null op)
(parser-error expr "find-next expects one and only one parameter of type expression or string."))
(let ((block-id (gensym)))
(cond
;; continually test a function, without returning the results of that function
((symbolp op)
(let ((test-fn (gensym))
(curr-index (gensym))
(start-index (gensym)))
`(block ,block-id
(let ((,test-fn #'(lambda nil ,op)) ; encapsulate the block of code into a test function
(,start-index ($state-index)))
(do ((,curr-index ,start-index (+ ,curr-index 1)))
((> ,curr-index $len) (return-failed-match ,block-id))
(multiple-value-bind
(sub-matches matched? consumed?)
(funcall ,test-fn)
(if (and matched? consumed?)
;; we've matched something *and* consumed part of the buffer. note:
;; operators such as 'maybe' will match but won't consume.
;;
;; at this point any changes to the state index done by the sub-parser
;; function need to be undone so that we can return the proper matched
;; text
(progn
(setf ($state-index) ,curr-index)
(return-values-from ,block-id
:sub-matches (subseq ($state-buffer) ,start-index (+ ,curr-index 1))
:matched t
:consumed (/= ,start-index ,curr-index))))))))))
;; find the first occurence of a string
((or (stringp op) (characterp op))
(setf op (string op))
(let* ((str-len (length op))
(pos (gensym))
(start-index (gensym))
(bool-consumed (> str-len 0)))
`(block ,block-id
(let ((,start-index ($state-index))
(,pos (search ,op ($state-buffer)
:test #'string=
:start2 ($state-index)
:end1 ,str-len
:end2 $len)))
(if (numberp ,pos)
;; matched the string
(progn
(setf ($state-index) ,pos)
(return-values-from ,block-id
:sub-matches (subseq ($state-buffer) ,start-index ,pos)
:matched t
:consumed (and (> ,pos ,start-index) ,bool-consumed)))
;; no match
(return-failed-match ,block-id))))))
(t (parser-error expr "Unrecognized argument passed to find-next."))))))
(defun parser-make-lambda-list (ops)
"Given a list of tokenized operators, parse out the remaining string literals as string
matchers and then encapsulate each sub-parser in a lambda."
(declare (type list ops))
(parse-strings-as-fns ops)
(let ((lst nil))
(loop for op in ops
do (push-to-end (list `#'(lambda nil ,op)) lst))
(push 'list lst)))
(defun parser-make-or (fns)
"Return the match results of whichever parser sub-functions succeeds to match."
(declare (type list fns))
(if (< (length fns) 2)
(parser-error expr "or expects at least two operands."))
(let ((fn-lst (gensym))
(fn (gensym))
(block-id (gensym)))
`(block ,block-id
(let ((,fn-lst ,(parser-make-lambda-list fns)))
(loop for ,fn in ,fn-lst
do (multiple-value-bind
(sub-matches matched? consumed?)
(funcall ,fn)
(if matched?
(return-values-from ,block-id
:sub-matches sub-matches
:matched t
:consumed consumed?))))
(return-failed-match ,block-id)))))
(defun parser-make-and (fns)
"Collect the matches of any sub-operations. If a single sub-operation fails then the entire operation
necessarily fails and no changes to buffer index are kept."
(declare (type list fns))
(if (< (length fns) 2)
(parser-error expr "and expects at least two operands."))
(let ((fn-lst (gensym))
(fn (gensym))
(start-index (gensym))
(bool-consumed (gensym))
(matches (gensym))
(block-id (gensym)))
`(block ,block-id
(let ((,fn-lst ,(parser-make-lambda-list fns))
(,bool-consumed nil)
(,matches nil)
(,start-index ($state-index)))
(setf ,matches (loop for ,fn in ,fn-lst
collect (multiple-value-bind
(sub-matches matched? consumed?)
(funcall ,fn)
(if consumed?
(setf ,bool-consumed t))
(if (not matched?)
(progn
(setf ($state-index) ,start-index)
(return-failed-match ,block-id))
sub-matches))))
(return-values-from ,block-id
:sub-matches ,matches
:matched t
:consumed ,bool-consumed)))))
(defun parser-make-maybe (fns)
"Make the maybe operator. The maybe operator is defined in terms of an OR with an empty string as
the last function."
(parser-make-or (push-to-end "" fns)))
(defun parser-make-pass (&rest ops)
"The pass operator, it is defined in terms of an or of two empty strings."
(parser-make-or (list "" "")))
(defun parser-make-group (ops)
"Collect the matches of any sub-operations and then concatenate them into one single match string."
(declare (type list ops))
(let ((fn-list (gensym))
(block-id (gensym))
(fn (gensym))
(bool-consumed (gensym))
(match (gensym)))
`(block ,block-id
(let* ((,fn-list ,(parser-make-lambda-list ops))
(,bool-consumed nil)
(sub-matches (loop for ,fn in ,fn-list
collect (multiple-value-bind
(sub-matches matched? consumed?)
(funcall ,fn)
(if consumed?
(setf ,bool-consumed t))
sub-matches))))
(return-values-from ,block-id
:sub-matches (list sub-matches)
:matched t
:consumed ,bool-consumed)))))
(defun parser-make-concat (ops)
"Collect the matches of any sub-operations and then concatenate them into one single match string."
(declare (type list ops))
(let ((block-id (gensym)))
`(block ,block-id
(multiple-value-bind
(sub-matches matched? consumed?)
,(parser-make-group ops)
(return-from ,block-id (values (concatenate* 'string sub-matches) matched? consumed?))))))
(defun parser-make-no-capture (ops)
"Do everything but return matches."
(declare (type list ops))
(let ((block-id (gensym)))
`(block ,block-id
(multiple-value-bind
(sub-matches matched? consumed?)
,(parser-make-group ops)
(return-from ,block-id (values nil matched? consumed?))))))
(defun parser-make-if (ops)
"Create a function to conditionally match sequences. Note: the matches found from the
conditions of an if statement are not recorded."
(declare (type list ops))
(if (or (< (length ops) 2) (> (length ops) 3))
(parser-error expr "if takes between 2 and 3 operands."))
(if (= (length ops) 2)
(push-to-end "" ops))
(let ((block-id (gensym))
(fns (gensym)))
`(block ,block-id
(let ((,fns ,(parser-make-lambda-list ops)))
(multiple-value-bind
(sub-matches matched? consumed?)
(funcall (first ,fns))
(if matched?
(funcall (second ,fns))
(funcall (third ,fns))))))))
(defun make-parser-make-repeat (lst)
(declare (type list lst))
(print lst)
(if (< (length lst) 3)
(parser-error expr "repeat expects at least three parameters."))
(parser-make-repeat
(cddr lst)
:min (abs (coerce (first lst) 'fixnum))
:max (if (eql (second lst) 'nil)
nil
(abs (coerce (second lst) 'fixnum)))))
(defun parser-make-repeat (ops &key min max)
(declare (type list ops))
(let ((block-id (gensym))
(lower-bound (gensym))
(fns (gensym)))
`(block ,block-id
(let ((,fns ,(parser-make-lambda-list ops))
(,lower-bound 0))
))))
(defun make-parser-replace-symbols (sym-sm sym-cm? sym-c?)
"Replace various symbols deep in a function. This function allows us to use some often used
variables by meaningful names and have then automatically replaced by gensyms instead of
having to do the gensyms for every parser function.
This also goes through and looks for symbols that refer to sub code blocks (i.e. sub parser
functions) and then inserts that code in."
(declare (type symbol sym-sm
sym-cm?
sym-c?))
#'(lambda (elm)
(cond ((eql elm 'matched?) sym-cm?)
((eql elm 'consumed?) sym-c?)
((eql elm 'sub-matches) sym-sm)
((and (symbolp elm) (gethash elm *intermediate-parser-fncs*))
(let ((code (gethash elm *intermediate-parser-fncs*)))
(remhash elm *intermediate-parser-fncs*)
code))
(t elm))))
(defun tokenize-sub-parser (code)
"Associate a gensym with a block of code for a sub-parser function so that sub-parser functions
don't interfere with any sub-parser functions they may be within during expression parsing."
(let ((fn-name (gensym)))
(setf (gethash fn-name *intermediate-parser-fncs*) code)
fn-name))
(defmacro parse-strings-as-fns (lst)
"Go through and parse any strings that were assumed as parameters to operators as string
matcher sub-parsers."
`(setf ,lst (map-dfs ,lst :elm-fn #'(lambda (elm)
(if (stringp elm)
(tokenize-sub-parser (parser-make-string-matcher elm))
elm)))))
(defun make-sub-parser (&rest fn-lst)
"Go through all ops recursively and replace instances of 'sub-matches', 'matched?', and
'consumed?' with the appropriate gensyms. No variables of sub-parser will be overwritten because the
parser constructs the pattern function bottom-up."
(declare (type list fn-lst))
(if (and (length fn-lst) (listp (car fn-lst)))
(setf fn-lst (car fn-lst)))
(let ((sub-matches (gensym))
(matched? (gensym))
(consumed? (gensym)))
(tokenize-sub-parser (map-dfs fn-lst :elm-fn (make-parser-replace-symbols
sub-matches
matched?
consumed?)))))
(defun parse-operator (lst)
"Handle a list of tokens either as an call to a pattern operator or a string literal.
If the type of the first element of lst is neither a symbol nor a string then error."
(declare (type list lst))
(let ((a (car lst))
(d (cdr lst)))
(cond ((or (characterp a) (stringp a))
(tokenize-sub-parser (parser-make-string-matcher (string a))))
((and (symbolp a) (gethash a *parser-ops*))
(make-sub-parser (funcall (gethash a *parser-ops*) (as-list d))))
(t (parser-error expr "Unknown type encountered.")))))
(defun parse-operators (block-id elm) ; TODO: should block-id be needed?
"For every car in the parser function parse for operators. Note: this function is applied
to every car by means of a mapcar in defun-parse."
(declare (type symbol block-id)
(type list elm))
`(multiple-value-bind
(sub-matches matched? consumed?)
,(map-dfs elm
:list-fn #'(lambda (lst)
(parse-operator (as-list lst))))
(if matched?
(push-to-end sub-matches ($state-matches)))))
(defmacro defun-parse (name &rest toks)
"Define a named parser function."
(declare (type symbol name)
(type list toks))
(let* ((block-id (gensym))
(parser-fnc-name (make-sub-parser (mapcar #'(lambda (elm)
(parse-operators block-id (as-list elm)))
toks)))
; get the parser function we just created
(code (gethash parser-fnc-name *intermediate-parser-fncs*)))
; empty out the hash table for the next parser function to use
(remhash parser-fnc-name *intermediate-parser-fncs*)
; the final built-up parser function
`(setf (gethash ',name *parser-ops*)
#'(lambda ($state $len)
,(push 'progn code)
($state-matches)))))
(defun parse (fn-name str)
"Parse a string using the specified parser function."
(declare (type symbol fn-name)
(type string str))
(let ((parse-fn (gethash fn-name *parser-ops*)))
(if parse-fn
(let ((state (make-parser-state :buffer str))
(len (length str)))
(funcall parse-fn state len))
(error (format nil "The parser function '~A' does not exist." fn-name)))))
(defmacro set-parser-op (&rest lst)
"Register a parser operator."
(if (not (eq (mod (length lst) 2) 0))
(parser-error expr "set-parser-op only accepts a multiple of 2 arguments."))
`(setf ,@(loop while (> (length lst) 0)
collect `(gethash ',(first lst) *parser-ops*)
collect (second lst)
do (setf lst (subseq lst 2)))))
(set-parser-op find-next #'parser-make-find-next
or #'parser-make-or
and #'parser-make-and
maybe #'parser-make-maybe
pass #'parser-make-pass
group #'parser-make-group
concat #'parser-make-concat
no-capture #'parser-make-no-capture
if #'parser-make-if
repeat #'make-parser-make-repeat)
(print (macroexpand-1 '(defun-parse test
(repeat 0 10 "")
(if (and "hello" " ")
"world"
"boo"))))
#|
(defun-parse test
(repeat 0 10 "")
(if (and "hello" " ")
"world"
"boo"))
(print (parse 'test "hello world"))
|#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment