Last active
February 7, 2019 10:35
-
-
Save rain-1/615cec52cf9ce92c07d3f76e74da162c to your computer and use it in GitHub Desktop.
parsing with grammars
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
;(start, (ACCEPT)) | |
;(start, (READ #\a, sym)) | |
;(start, (READ #\b, sym)) | |
;(start, (READ #\(, "sexps"), PUSH(#\))) | |
;(sym, (ACCEPT)) | |
;(sym, (READ #\a, sym)) | |
;(sym, (READ #\b, sym)) | |
;(sexps, (ACCEPT)) | |
;(sexps, (READ #\a, sym, PUSH(sexps)) | |
;(sexps, (READ #\b, sym, PUSH(sexps)) | |
;(sexps, (READ #\(, sexps), PUSH #\), PUSH sexps) | |
(define (push stack . elements) | |
(append elements stack)) | |
(define (complete-stack stack) | |
;; (write `(draining ,stack))(newline) | |
(if (null? stack) | |
#t | |
(let ((head (car stack)) | |
(stack (cdr stack))) | |
(cond ((char? head) | |
(let ((ch (read-char))) | |
(if (equal? ch head) | |
(complete-stack stack) | |
(error "xy problem" (cons ch head))))) | |
((procedure? head) | |
(head stack)) | |
(else (error "stack")))))) | |
(define (start stack) | |
(let ((ch (peek-char))) | |
(if (eof-object? ch) | |
(complete-stack stack) | |
(case ch | |
((#\a) (read-char) (sym stack)) | |
((#\b) (read-char) (sym stack)) | |
((#\() (read-char) (sexps (push stack #\)))) | |
(else (complete-stack stack)))))) | |
(define (sym stack) | |
(let ((ch (peek-char))) | |
(if (eof-object? ch) | |
(complete-stack stack) | |
(case ch | |
((#\a) (read-char) (sym stack)) | |
((#\b) (read-char) (sym stack)) | |
(else (complete-stack stack)))))) | |
(define (sexps stack) | |
(let ((ch (peek-char))) | |
(if (eof-object? ch) | |
(complete-stack stack) | |
(case ch | |
((#\a) (read-char) (sym (push stack sexps))) | |
((#\b) (read-char) (sym (push stack sexps))) | |
((#\() (read-char) (sexps (push stack #\) sexps))) | |
(else (complete-stack stack)))))) | |
(define (go str) | |
(with-input-from-string str | |
(lambda () | |
(start '())))) | |
(go "ab") | |
(go "(ababab)") | |
(go "((a(ba)b(ab))(ab(a)a)(b(b)aaa)a)") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require racket/trace) | |
(define (complete-stack stack) | |
; (write `(draining ,stack))(newline) | |
(if (null? stack) | |
#t | |
(let ((head (car stack)) | |
(stack (cdr stack))) | |
(cond ((char? head) | |
(let ((ch (read-char))) | |
(if (equal? ch head) | |
(complete-stack stack) | |
(error "xy problem" (cons ch head))))) | |
((procedure? head) | |
(head stack)) | |
(else (error "stack" head)))))) | |
(define-syntax define-parse-rule | |
(syntax-rules () | |
((_ <name> () (<char> . <work>) ...) | |
(define-parse-rule^ stack <name> (complete-stack stack) (<char> . <work>) ...)) | |
((_ <name> (<char> . <work>) ...) | |
(define-parse-rule^ stack <name> (error "early-eof" '<name>) (<char> . <work>) ...)))) | |
(define-syntax define-parse-rule^ | |
(syntax-rules () | |
((_ stack <name> <eof> (<char> . <work>) ...) | |
(define (<name> stack) | |
(let ((ch (peek-char))) | |
(if (eof-object? ch) | |
<eof> | |
(case ch | |
((<char>) (read-char) (complete-stack (append (list . <work>) stack))) | |
... | |
(else <eof>)))))))) | |
(define-parse-rule sexp | |
(#\a) | |
(#\' sexp) | |
(#\( sexps #\))) | |
(define-parse-rule sexps | |
() | |
(#\a sexps1) | |
(#\' sexp sexps1) | |
(#\( sexps #\) sexps1)) | |
(define-parse-rule sexps1 | |
() | |
(#\. sexp) | |
(#\a sexps1) | |
(#\' sexp sexps1) | |
(#\( sexps #\) sexps1)) | |
;(trace sexp) | |
;(trace sexps) | |
;(trace sexps^) | |
(define (go str) | |
(with-input-from-string str | |
(lambda () | |
(sexp '())))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <assert.h> | |
#define DEBUG | |
// | |
#define die(msg) do { fprintf(stderr, "Oops in %s, line %d: %s\n", __FUNCTION__, __LINE__, msg); exit(1); } while(0) | |
int fpeek(FILE *fptr) { | |
int c = fgetc(fptr); | |
ungetc(c, fptr); | |
return c; | |
} | |
// | |
typedef enum { | |
TOKEN_ATOM, | |
TOKEN_OPEN_PAREN, | |
TOKEN_CLOSE_PAREN, | |
TOKEN_DOT, | |
TOKEN_QUOTE, | |
TOKEN_EOF, | |
PARSE_SEXP, | |
PARSE_SEXPS, | |
PARSE_SEXPS1, | |
} scm_token_type; | |
typedef struct { | |
scm_token_type ty; | |
int len; | |
char *text; | |
} scm_token; | |
void display_token(scm_token t) { | |
switch(t.ty) { | |
case TOKEN_ATOM: | |
printf("%s", t.text); | |
break; | |
case TOKEN_OPEN_PAREN: | |
printf("("); | |
break; | |
case TOKEN_CLOSE_PAREN: | |
printf(")"); | |
break; | |
case TOKEN_DOT: | |
printf("."); | |
break; | |
case TOKEN_QUOTE: | |
printf("."); | |
break; | |
case TOKEN_EOF: | |
printf("<eof>"); | |
break; | |
case PARSE_SEXP: | |
printf("<sexp>"); | |
break; | |
case PARSE_SEXPS: | |
printf("<sexps"); | |
break; | |
case PARSE_SEXPS1: | |
printf("<sexps1>"); | |
break; | |
default: | |
die("No such token."); | |
} | |
} | |
int whitespace_char(int c) { | |
return c == ' ' || c == '\n'; | |
} | |
int atom_char(int c) { | |
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-'; | |
} | |
scm_token buffered_token; | |
int have_buffered_token = 0; | |
void unread_token(scm_token t) { | |
assert(!have_buffered_token); | |
have_buffered_token = 1; | |
buffered_token = t; | |
} | |
scm_token next_token(FILE* fptr) { | |
if(have_buffered_token) { | |
have_buffered_token = 0; | |
return buffered_token; | |
} | |
// skip whitespace | |
do { | |
if(whitespace_char(fpeek(fptr))) | |
fgetc(fptr); | |
else if(fpeek(fptr) == ';') | |
while(fgetc(fptr) != '\n') {} | |
else | |
break; | |
} while(1); | |
scm_token t; | |
int c = fgetc(fptr); | |
if(c == EOF) | |
t.ty = TOKEN_EOF; | |
else if(c == '(') | |
t.ty = TOKEN_OPEN_PAREN; | |
else if(c == ')') | |
t.ty = TOKEN_CLOSE_PAREN; | |
else if(c == '.') | |
t.ty = TOKEN_DOT; | |
else if(c == '\'') | |
t.ty = TOKEN_QUOTE; | |
else if(atom_char(c)) { | |
t.ty = TOKEN_ATOM; | |
t.len = 0; | |
t.text = malloc(1); | |
ungetc(c, fptr); | |
while(atom_char(fpeek(fptr))) { | |
t.text = realloc(t.text, t.len+1); | |
t.text[t.len] = fgetc(fptr); | |
t.len++; | |
} | |
t.text[t.len] = 0; | |
} | |
else { | |
die("Invalid character"); | |
} | |
return t; | |
} | |
// | |
#define STACK_DEPTH 256 | |
scm_token parser_work[STACK_DEPTH]; | |
int parser_work_top = 0; | |
scm_token parser_res[STACK_DEPTH]; | |
int parser_res_top = 0; | |
void parse(FILE *fptr) { | |
scm_token w, t; | |
while(parser_work_top > 0) { | |
t = next_token(fptr); | |
#ifdef DEBUG | |
printf("T:"); | |
display_token(t); | |
puts(""); | |
#endif | |
w = parser_work[--parser_work_top]; | |
switch(w.ty) { | |
case PARSE_SEXP: | |
switch(t.ty) { | |
case TOKEN_ATOM: | |
break; | |
case TOKEN_QUOTE: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXP }; | |
break; | |
case TOKEN_OPEN_PAREN: | |
parser_work[parser_work_top++] = (scm_token){ .ty = TOKEN_CLOSE_PAREN }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS }; | |
break; | |
default: | |
die("parse sexp: unexpected token"); | |
} | |
break; | |
case PARSE_SEXPS: | |
switch(t.ty) { | |
case TOKEN_ATOM: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
break; | |
case TOKEN_QUOTE: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS }; | |
break; | |
case TOKEN_OPEN_PAREN: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = TOKEN_CLOSE_PAREN }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS }; | |
break; | |
default: | |
unread_token(t); | |
} | |
break; | |
case PARSE_SEXPS1: | |
switch(t.ty) { | |
case TOKEN_DOT: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXP }; | |
break; | |
case TOKEN_ATOM: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
break; | |
case TOKEN_QUOTE: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXP }; | |
break; | |
case TOKEN_OPEN_PAREN: | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS1 }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = TOKEN_CLOSE_PAREN }; | |
parser_work[parser_work_top++] = (scm_token){ .ty = PARSE_SEXPS }; | |
break; | |
default: | |
unread_token(t); | |
} | |
break; | |
default: | |
if(t.ty != w.ty) { | |
#ifdef DEBUG | |
puts("wanted: "); | |
display_token(w); | |
puts(""); | |
puts("got: "); | |
display_token(t); | |
puts(""); | |
#endif | |
die("parse: xy problem"); | |
} | |
} | |
} | |
} | |
// | |
int main(void) { | |
scm_token t; | |
t.ty = PARSE_SEXP; | |
parser_work[parser_work_top++] = t; | |
parse(stdin); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment