Skip to content

Instantly share code, notes, and snippets.

@amirouche
Last active September 1, 2017 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amirouche/72c24b7122b7067ca4ee03e7a5ba616b to your computer and use it in GitHub Desktop.
Save amirouche/72c24b7122b7067ca4ee03e7a5ba616b to your computer and use it in GitHub Desktop.
;;; combinatorix
;;;
;;; Copyright © 2017 Amirouche Boubekki <amirouche@hypermove.net>
;;;
;;; This module is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public License
;;; as published by the Free Software Foundation; either version 3 of
;;; the License, or (at your option) any later version.
;;;
;;; This module is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this module. If not, see
;;; <http://www.gnu.org/licenses/>.
;;; Commentary:
;;
;; Parser combinators.
;;
;; TODO:
;;
;; - improve error handling
;; - further optimize
;;
;; Also see:
;;
;; - https://github.com/djspiewak/gll-combinators
;; - https://docs.racket-lang.org/parsack/index.html
;; - https://docs.racket-lang.org/megaparsack/
;;
;;; Code:
(define-module (combinatorix))
(use-modules (srfi srfi-9))
(use-modules (ice-9 match))
(use-modules (ice-9 pretty-print))
(define (pp . args)
(apply pretty-print args)
(car (reverse args)))
;; test macro
(define-syntax-rule (test-check test-name expr expected)
(when (getenv "DEBUG")
(format #t "* ~a: " test-name)
(let ((expr* expr)
(expected* expected))
(if (equal? expr* expected*)
(format #t "PASS :)\n")
(begin
(format #t "FAILED :(\n")
(format #t "** expected: ~a\n" expected*)
(format #t "** found: ~a\n" expr*))))))
;; <xchar> are extended char that keeps track of line, column and
;; offset information to help with debugging parser combinators
(define-record-type <xchar>
(make-xchar char line column offset)
xchar?
(char xchar-char)
(line xchar-line)
(column xchar-column)
(offset xchar-offset))
(define (xchar-format xchar)
(format #f "<xchar ~a (~a, ~a) @ ~a)"
(xchar-char xchar)
(xchar-line xchar)
(xchar-column xchar)
(xchar-offset xchar)))
;; delayed lambda stream
(define %stream-null #(stream-null)) ;; unique object
(define (stream-null? stream)
(eq? (stream) %stream-null))
(define (list->stream lst)
(let loop ((lst lst))
(lambda ()
(if (null? lst)
%stream-null
(cons (car lst) (loop (cdr lst)))))))
(test-check "list->stream of empty list"
((list->stream '()))
%stream-null)
(define (stream->list stream)
(let loop ((stream stream)
(out '()))
(let ((next (stream)))
(if (stream-null? next)
(reverse out)
(loop (cdr next) (cons (car next) out))))))
(define (stream-car stream)
(if (stream-null? stream)
(raise 'empty-stream-error)
(car (stream))))
(define (stream-cdr stream)
(if (stream-null? stream)
(raise 'empty-stream-error)
(cdr (stream))))
;; (define (stream-map proc stream) ;; FIXME: I think it's useless
;; (let loop ((stream stream))
;; (lambda ()
;; (let ((next (stream)))
;; (if (stream-null? next)
;; %stream-null
;; (cons (proc (car next)) (loop (cdr next))))))))
;; string->xchar-stream
(define (string->xchar-stream string)
;; FIXME: optimize
(let loop ((chars (string->list string))
(line 1)
(column 1)
(offset 0)
(out '()))
(if (null? chars)
(list->stream (reverse out))
(if (eq? (car chars) #\newline)
(loop (cdr chars)
(+ 1 line)
1
(+ 1 offset)
(cons (make-xchar #\newline line column offset) out))
(loop (cdr chars)
line
(+ 1 column)
(+ 1 offset)
(cons (make-xchar (car chars) line column offset) out))))))
;; parse
(define-public (parse parser string)
(let ((xchars (string->xchar-stream string)))
(let ((result '()))
(parser '()
xchars
(lambda (out stream) (set! result out))
(lambda (error stream) (set! result error)))
result)))
(define-public (parse-xchar char)
(lambda (ctx stream continue fail)
(if (stream-null? stream)
(fail (format #f "failed: (parse-xchar ~s) because of empty stream" char)
stream)
(let ((xchar (stream-car stream)))
(if (eq? (xchar-char xchar) char)
(continue xchar (stream-cdr stream))
(fail (format #f "failed: (parse-xchar ~s) @ ~s" char xchar) stream))))))
(test-check "parse-xchar"
(xchar-char (parse (parse-xchar #\c) "c"))
#\c)
(define (either one two)
(lambda (ctx stream continue fail)
(one ctx stream continue (lambda _
(two ctx stream continue fail)))))
(test-check "either az 1"
(xchar-char (parse (either (parse-xchar #\a) (parse-xchar #\z)) "a"))
#\a)
(test-check "either az 2"
(xchar-char (parse (either (parse-xchar #\a) (parse-xchar #\z)) "z"))
#\z)
(define (each one two)
(lambda (ctx stream continue fail)
(one ctx
stream
(lambda (out stream)
(two ctx stream (lambda (out2 stream)
(continue (cons out out2) stream)) ;; FIXME?
fail))
fail)))
(test-check "each az"
((match-lambda ((a . b) (cons (xchar-char a) (xchar-char b))))
(parse (each (parse-xchar #\a) (parse-xchar #\z)) "az"))
(cons #\a #\z))
(test-check "each+either ae"
((match-lambda ((a . b) (cons (xchar-char a) (xchar-char b))))
(parse (each (either (parse-xchar #\a) (parse-xchar #\z))
(parse-xchar #\e))
"ae"))
(cons #\a #\e))
(test-check "each+either ze"
((match-lambda ((a . b) (cons (xchar-char a) (xchar-char b))))
(parse (each (either (parse-xchar #\a) (parse-xchar #\z))
(parse-xchar #\e))
"ze"))
(cons #\z #\e))
(define (%either* . parsers)
(lambda (ctx stream continue fail)
(let loop ((parsers parsers))
(if (null? parsers)
(fail "No parsers succeed" stream)
((force (car parsers)) ctx stream continue (lambda _ (loop (cdr parsers))))))))
(define-syntax-rule (either* parser ...)
(%either* (delay parser) ...))
(test-check "either* abc 1"
(xchar-char (parse (either* (parse-xchar #\a) (parse-xchar #\b) (parse-xchar #\c))
"a"))
#\a)
(test-check "either* abc 2"
(xchar-char (parse (either* (parse-xchar #\a) (parse-xchar #\b) (parse-xchar #\c))
"b"))
#\b)
(test-check "either* abc 3"
(xchar-char (parse (either* (parse-xchar #\a) (parse-xchar #\b) (parse-xchar #\c))
"c"))
#\c)
(define (%each* . parsers)
(lambda (ctx stream continue fail)
(let loop ((parsers parsers)
(stream stream)
(out '()))
(if (null? parsers)
(continue (reverse out) stream)
((force (car parsers)) ctx stream (lambda (out2 stream) (loop (cdr parsers) stream (cons out2 out))) fail)))))
(define-syntax-rule (each* parser ...)
(%each* (delay parser) ...))
(test-check "each* abc"
(list->string (map xchar-char (parse (each* (parse-xchar #\a)
(parse-xchar #\b)
(parse-xchar #\c)
(parse-xchar #\d))
"abcd")))
"abcd")
(define-public (process proc parser)
(lambda (ctx stream continue fail)
(parser ctx stream (lambda (out stream) (continue (proc out) stream)) fail)))
(define flatten
(match-lambda ((a b)
(cons a b))
(a (list a))))
(define recur (process flatten
(either (each* (parse-xchar #\a) recur)
(parse-xchar #\a))))
(test-check "recursive aaaa"
(list->string (map xchar-char (parse recur "aaaa")))
"aaaa")
(define-public (zero-or-more parser)
(lambda (ctx stream continue fail)
(let loop ((stream stream)
(out '()))
(parser ctx
stream
(lambda (out2 stream)
(loop stream (cons out2 out)))
(lambda _
(continue (reverse out) stream))))))
(test-check "zero or more 1"
(list->string (map xchar-char (parse (zero-or-more (parse-xchar #\a)) "aaa")))
"aaa")
(test-check "zero or more 2"
(list->string (map xchar-char (parse (zero-or-more (parse-xchar #\a)) "")))
"")
(define-public (one-or-more parser)
(each parser (zero-or-more parser)))
(test-check "one or more"
(list->string (map xchar-char (parse (one-or-more (parse-xchar #\a)) "aaa")))
"aaa")
(test-check "one or more 2"
(parse (one-or-more (parse-xchar #\a)) "")
"failed: (parse-xchar #\\a) because of empty stream")
(define-public (parse-unless predicate parser)
(lambda (ctx stream continue fail)
(predicate ctx stream fail (lambda _ (parser ctx stream continue fail)))))
(define-public parse-any
(lambda (ctx stream continue fail)
(if (stream-null? stream)
(fail "stream null error" stream)
(continue (stream-car stream) (stream-cdr stream)))))
(define-record-type <token>
(make-token kind string line column offset)
token?
(kind token-kind)
(string token-string)
(line token-line)
(column token-column)
(offset token-offset))
(define string-sanitize
(match-lambda
(($ <xchar> char _ ....) (list->string (list char)))
((a . b) (apply string-append (map list->string (list (list (xchar-char a)) (list (xchar-char b))))))))
(define (->string out)
(let ((head (car out)))
(make-token 'string
(apply string-append `("\"" ,(apply string-append (map string-sanitize (cadr out))) "\""))
(xchar-line head)
(xchar-column head)
(xchar-offset head))))
(define <string>
(process ->string
(each* (parse-xchar #\")
(zero-or-more
(parse-unless (parse-xchar #\")
(either (each (parse-xchar #\\)
(parse-xchar #\"))
parse-any)))
(parse-xchar #\"))))
(test-check "<string> 1"
(token-string (parse <string> "\"abc\""))
"\"abc\"")
(test-check "<string> 2"
(token-string (parse <string> "\"a\\\"b\""))
"\"a\\\"b\"")
(define <space> (process (lambda _ #f)
(either (parse-xchar #\space)
(parse-xchar #\newline))))
(define (->identifier value)
(let ((head (car value)))
(make-token 'identifier
(list->string (map xchar-char value))
(xchar-line head)
(xchar-column head)
(xchar-offset head))))
(define <identifier>
(process ->identifier
(one-or-more (parse-unless (either* <space> (parse-xchar #\)) (parse-xchar #\())
parse-any))))
(test-check "<identifier>"
(token-string (parse <identifier> "abcdef"))
"abcdef")
(define (xchar->token kind string)
(lambda (xchar)
(make-token kind
string
(xchar-line xchar)
(xchar-column xchar)
(xchar-offset xchar))))
(define ->paren-open (xchar->token 'paren-open "("))
(define <paren-open> (process ->paren-open
(parse-xchar #\()))
(define ->paren-close (xchar->token 'paren-close ")"))
(define <paren-close> (process ->paren-close
(parse-xchar #\))))
(define <atom> (either* <sexp>
<identifier>
<string>
<space>))
(define ->sexp
(match-lambda ((paren-open _ atoms paren-close)
`(,paren-open ,@(filter (lambda (x) (not (eq? x #f))) atoms) ,paren-close))))
(define <sexp> (process ->sexp (each* <paren-open>
(zero-or-more <space>)
(zero-or-more <atom>)
<paren-close>)))
(test-check "<sexp>"
(map token-string (parse <sexp> "(abc def )"))
'("(" "abc" "def" ")"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment