Skip to content

Instantly share code, notes, and snippets.

@draftcode
Created August 5, 2010 01:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save draftcode/509096 to your computer and use it in GitHub Desktop.
Save draftcode/509096 to your computer and use it in GitHub Desktop.
lexer generator written in scheme
#!/usr/bin/env gosh
(use srfi-1)
(use srfi-13)
;; {{{ utility functions
(define (unique-list eq l)
(if (null? l) l (lset-adjoin eq (unique-list eq (cdr l)) (car l) )))
(define (make-pair l)
(if (= (length l) 2) (list l)
(append (map (lambda (x) (cons (car l) x)) (cdr l))
(make-pair (cdr l)))))
(define (part pred l)
(let loop ((l l) (t '()) (f '()))
(if (null? l) (list t f)
(if (pred (car l))
(loop (cdr l) (cons (car l) t) f)
(loop (cdr l) t (cons (car l) f))))))
(define (clustering pred l)
(if (null? l) l
(let1 r (part (lambda (x) (pred (car l) x)) l)
(cons (car r) (clustering pred (cadr r))))))
(define (cross-map pred . ll)
(if (null? (cdr ll))
(map pred (car ll))
(append-map (lambda (x) (apply cross-map (pa$ pred x) (cdr ll)))
(car ll))))
(define (filter-cross-map pred . ll)
(if (null? (cdr ll))
(filter-map pred (car ll))
(append-map (lambda (x) (apply filter-cross-map (pa$ pred x) (cdr ll)))
(car ll))))
(define (map-with-number pred num . ll)
(if (any null? ll) '()
(cons (apply pred num (map car ll))
(apply map-with-number pred (+ num 1) (map cdr ll)))))
;; }}}
;; {{{ automaton construct
(define (new-automaton S E N X T C)
(list S E N X T C))
(define genstate gensym)
(define (start A) (car A))
(define (accept-states A) (second A))
(define (states A) (third A))
(define (chars A) (fourth A))
(define (trans A) (fifth A))
(define (tokentag A) (sixth A))
(define (new-transition from char to)
(list from char to))
(define (from-state t) (car t))
(define (trans-char t) (second t))
(define (to-state t) (third t))
(define (new-tokentag state tags)
(cons state tags))
(define (tokentag-state p) (car p))
(define (tokentag-tags p) (cdr p))
(define (is-accept-state? A s)
(if (find (cut eq? <> s) (accept-states A)) #t #f))
(define (get-next-state A s x)
(map to-state
(filter (lambda (t) (and (eq? s (from-state t))
(eq? x (trans-char t))))
(trans A))))
(define (get-next-states A S C)
(cond ((and (list? S) (list? C))
(map (lambda (s) (map (cut get-next-state s <>) C)) S))
((list? C) (map (cut get-next-state A S <>) C))
((list? S) (map (cut get-next-state A <> C) S))
(else (list (get-next-state A S C)))))
(define (get-tokentags A s)
(let1 r (assq s (tokentag A))
(if r (tokentag-tags r) r)))
(define (eps-closure A S) ;; {{{
(define (eps-closure-1 A s)
(let loop ((eps-states (list s)))
(let1 next-states
(apply lset-union eq? eps-states
(map
(lambda (s) (filter-map (lambda (t) (if (and (eq? s (from-state t))
(eq? 'eps (trans-char t)))
(to-state t) #f))
(trans A)))
eps-states))
(if (= (length next-states) (length eps-states))
eps-states
(loop next-states)))))
(if (list? S)
(apply lset-union eq?
(map (cut eps-closure-1 A <>) S))
(eps-closure-1 A S))) ;; }}}
;; }}}
;; {{{ regexp -> NFA
(define (construct-automaton-str str) ;; {{{
(let1 start (genstate)
(let loop ((l (string->list str))
(curr start)
(states (list start))
(tran '()))
(if (null? l) (new-automaton start (list curr) states
(unique-list eq? (string->list str))
tran '())
(let1 newstate (genstate)
(loop (cdr l)
newstate
(cons newstate states)
(cons (new-transition curr (car l) newstate)
tran))))))) ;; }}}
(define (construct-automaton-any) ;; {{{
(let ((start (genstate))
(endstate (genstate))
(charlist (string->list "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")))
(new-automaton start (list endstate) (list start endstate) charlist
(map (lambda (c) (new-transition start c endstate))
charlist)
'()))) ;; }}}
(define (construct-automaton-digit) ;; {{{
(let ((start (genstate))
(endstate (genstate))
(charlist (string->list "0123456789")))
(new-automaton start (list endstate) (list start endstate) charlist
(map (lambda (c) (new-transition start c endstate))
charlist)
'()))) ;; }}}
(define (automaton-attach-token A c) ;; {{{
(new-automaton
(start A)
(accept-states A)
(states A)
(chars A)
(trans A)
(append (tokentag A)
(map (lambda (s) (new-tokentag s c)) (accept-states A))))) ;; }}}
(define (combine-automaton-or A B) ;; {{{
(let ((startstate (genstate)))
(new-automaton
startstate
(append (accept-states A) (accept-states B))
(append (states A) (states B) (list startstate))
(lset-union eq? (chars A) (chars B))
(append (trans A) (trans B)
(list (new-transition startstate 'eps (start A))
(new-transition startstate 'eps (start B))))
(append (tokentag A) (tokentag B))))) ;; }}}
(define (combine-automaton-and A B) ;; {{{
(let ((startstate (genstate)) (midstate (genstate)) (endstate (genstate)))
(new-automaton
startstate (list endstate)
(append (states A) (states B) (list startstate midstate endstate))
(lset-union eq? (chars A) (chars B))
(append (trans A) (trans B)
(list (new-transition startstate 'eps (start A))
(new-transition midstate 'eps (start B)))
(map (lambda (s) (new-transition s 'eps midstate)) (accept-states A))
(map (lambda (s) (new-transition s 'eps endstate)) (accept-states B)))
'()))) ;; }}}
(define (combine-automaton-closure A) ;; {{{
(let ((startstate (genstate)) (midstart (genstate)) (midend (genstate)) (endstate (genstate)))
(new-automaton
startstate (list endstate)
(append (states A) (list startstate midstart midend endstate))
(chars A)
(append (trans A)
(list (new-transition startstate 'eps endstate)
(new-transition startstate 'eps midstart)
(new-transition midend 'eps midstart)
(new-transition midend 'eps endstate)
(new-transition midstart 'eps (start A)))
(map (lambda (s) (list s 'eps midend)) (accept-states A)))
'()))) ;; }}}
(define (construct-automaton regextree) ;; {{{
(cond
((eq? 'any regextree) (construct-automaton-any))
((eq? 'digit regextree) (construct-automaton-digit))
((string? regextree) (construct-automaton-str regextree))
((eq? 'token (first regextree))
(automaton-attach-token (construct-automaton (second regextree))
(cddr regextree)))
((eq? '+ (first regextree))
(combine-automaton-and (construct-automaton (second regextree))
(combine-automaton-closure
(construct-automaton (second regextree)))))
((eq? 'or (first regextree))
(fold-right (lambda (x r) (if (null? r)
(construct-automaton x)
(combine-automaton-or (construct-automaton x) r)))
'()
(cdr regextree)))
((eq? 'and (first regextree))
(fold-right (lambda (x r) (if (null? r)
(construct-automaton x)
(combine-automaton-and (construct-automaton x) r)))
'()
(cdr regextree)))
((eq? '* (first regextree))
(combine-automaton-closure
(construct-automaton (second regextree)))))) ;; }}}
;; }}}
;; {{{ NFA->DFA
(define (NFA->DFA A)
(define (renumber-state states trans)
(let1 states-pair (map (lambda (S) (cons (genstate) S)) states)
(define (asc S)
(any (lambda (p) (if (lset= eq? S (cdr p)) (car p) #f))
states-pair))
(new-automaton
(asc (eps-closure A (start A)))
(filter-map (lambda (p) (if (null? (lset-intersection eq? (cdr p) (accept-states A)))
#f (car p))) states-pair)
(map car states-pair)
(chars A)
(map (lambda (t)
(new-transition (asc (from-state t)) (trans-char t) (asc (to-state t))))
trans)
(filter-map (lambda (p)
(let1 r (filter-map (lambda (s) (get-tokentags A s)) (cdr p))
(if (null? r) #f
(new-tokentag (car p)
(unique-list eq? (apply append r))))))
states-pair))))
(let loop ((finished (list))
(newtrans (list))
(notyet (list (eps-closure A (start A)))))
(if (null? notyet) (renumber-state finished newtrans)
(let ((new-add-trans
(filter-map
(lambda (c)
(let1 to-state (eps-closure A (apply append (get-next-states A (car notyet) c)))
(if (null? to-state) #f (new-transition (car notyet) c to-state))))
(chars A)))
(new-finished (cons (car notyet) finished)))
(loop new-finished
(append new-add-trans newtrans)
(lset-union
(cut lset= eq? <> <>)
(cdr notyet)
(filter-map (lambda (t)
(if (find (lambda (S) (lset= eq? S (to-state t)))
new-finished)
#f (to-state t))) new-add-trans)))))))
;; }}}
;; {{{ cut-unused-state
(define (cut-unused-state A)
(let* ((usedstate
(let loop ((used (list (start A))))
(let1 next-used
(unique-list
eq? (append
used (filter-cross-map
(lambda (s t)
(if (eq? (from-state t) s) (to-state t) #f))
used (trans A))))
(if (= (length used) (length next-used))
used (loop next-used)))))
(usedtransition
(filter (lambda (t) (find (cut eq? (from-state t) <>) usedstate))
(trans A))))
(new-automaton
(start A)
(filter (lambda (s) (find (cut eq? s <>) usedstate))
(accept-states A))
usedstate
(unique-list (cut eq? <> <>)
(map (cut trans-char <>) usedtransition))
usedtransition
(filter (lambda (p) (find (cut eq? (tokentag-state p) <>) usedstate))
(tokentag A)))))
;; }}}
;; {{{ minimize-DFA
(define (minimize-DFA A)
(define (renumber-state states)
(let1 states-pair (map (cut cons (genstate) <>) states)
(define (asc s)
(any (lambda (p) (if (find (cut eq? s <>) (cdr p)) (car p) #f))
states-pair))
(cut-unused-state
(new-automaton
(car (find (lambda (p) (find (lambda (s) (eq? s (start A))) (cdr p)))
states-pair))
(filter-map (lambda (p) (if (null? (lset-intersection eq? (cdr p) (accept-states A)))
#f (car p))) states-pair)
(map car states-pair)
(chars A)
(append-map
(lambda (p) (filter-map (lambda (c)
(let1 next (get-next-state A (cadr p) c)
(if (null? next) #f
(new-transition (car p) c (asc (car next))))))
(chars A)))
states-pair)
(filter-map (lambda (p)
(let1 r (filter-map (lambda (s) (get-tokentags A s)) (cdr p))
(if (null? r) #f (cons (car p) (unique-list eq? (apply append r))))))
states-pair)))))
(let loop ((l (clustering
(lambda (s1 s2)
(cond ((not (eq? (is-accept-state? A s1)
(is-accept-state? A s2))) #f)
((not (equal? (get-tokentags A s1) (get-tokentags A s2))) #f)
(else #t)))
(states A))))
(define (convert-to-group s)
(if (null? s) '()
(find (lambda (G) (find (lambda (gs) (eq? gs (car s))) G)) l)))
(define (grouping G)
(map (cut map car <>)
(clustering
(lambda (gl1 gl2)
(fold (lambda (g1 g2 r) (and r (lset= eq? g1 g2))) #t
(cdr gl1) (cdr gl2)))
(map (lambda (s)
(cons s (map convert-to-group (get-next-states A s (chars A)))))
G))))
(let1 new-l (append-map (lambda (G) (grouping G)) l)
(if (= (length new-l) (length l))
(renumber-state l)
(loop new-l)))))
;; }}}
;; {{{ simulate-DFA
(define (simulate-DFA A str)
(let loop ((l (string->list str))
(state (start A)))
;; Accept if and only if str is all consumed and last state is one of accept states.
(if (null? l) (find (cut eq? state <>) (accept-states A))
(let1 next-state (get-next-state A state (car l))
(if (or (null? next-state) (> 1 (length next-state)))
#f ;; If next state is nothing or are more than two states then str is not acceptable.
(loop (cdr l) (car next-state)))))))
;; }}}
;; {{{ print-graphviz-DFA
(define (print-graphviz-DFA A)
(print "digraph finite_state_machine {")
(print "\trankdir=LR;")
(print "\tsize=\"800,500\"")
(display "\tnode [shape = doublecircle];")
(map (lambda (s) (format #t " ~A" s)) (accept-states A))
(print ";")
(print "\tnode [shape = circle];")
(map (lambda (t)
(format #t "\t~A -> ~A [ label = \"~A\" ];\n"
(from-state t) (to-state t) (trans-char t)))
(trans A))
(print "}"))
;; }}}
;; {{{ print-C-code
(define (print-C-code header-out-port impl-out-port
A optional-token tokentag-string after-accept-code)
(define (state->tokenstring s)
(if-let1 tag (get-tokentags A s) (cdr (assoc tag tokentag-string)) #f))
(format header-out-port "#ifndef LEXER_HEADER_INCLUDED\n")
(format header-out-port "#define LEXER_HEADER_INCLUDED\n")
(newline header-out-port)
(format header-out-port "#include <stdbool.h>\n")
(format header-out-port "enum tokens {\n")
(map-with-number
(lambda (n token) (format header-out-port " ~A = ~D,\n" token n)) 1
(fold (lambda (s r) (lset-adjoin eq? r (state->tokenstring s)))
optional-token (accept-states A)))
(format header-out-port "};\n")
(newline header-out-port)
(format header-out-port "bool get_next_token(const char **input, enum tokens *token, long *num, const char **str);\n")
(newline header-out-port)
(format header-out-port "#endif /* end of include guard */")
(format impl-out-port "#include \"lex.h\"\n")
(format impl-out-port "#include <stdlib.h>\n")
(format impl-out-port "#include <string.h>\n")
(format impl-out-port "#define NUM_STATES ~D\n" (+ 1 (length (states A))))
(format impl-out-port "#define START_STATE ~A\n" (start A))
(newline impl-out-port)
(format impl-out-port "enum state {\n")
(format impl-out-port " INVALID_STATE = 0,\n")
(map (lambda (s) (format impl-out-port " ~A,\n" s)) (states A))
(format impl-out-port "};\n")
(newline impl-out-port)
(format impl-out-port "enum state trans[128][NUM_STATES] = {\n")
(map (lambda (c)
(format impl-out-port " { INVALID_STATE")
(map (lambda (s)
(let1 to_list (get-next-state A s c)
(if (null? to_list)
(format impl-out-port ", INVALID_STATE")
(format impl-out-port ", ~13A" (car to_list)))))
(states A))
(format impl-out-port " },\n"))
(map (cut integer->char <>) (iota 128)))
(format impl-out-port "};\n")
(newline impl-out-port)
(format impl-out-port "unsigned long accept_state[NUM_STATES] = {\n")
(format impl-out-port " 0, // INVALID_STATE\n")
(map (lambda (s) (if (is-accept-state? A s)
(format impl-out-port " ~A,\n" (state->tokenstring s))
(format impl-out-port " 0,\n")))
(states A))
(format impl-out-port "};")
(newline impl-out-port)
(newline impl-out-port)
(format impl-out-port "//////////////////////////////////////////////////////////////////////\n")
(newline impl-out-port)
(format impl-out-port "bool get_next_token(const char **input, enum tokens *token, long *num, const char **str)\n")
(format impl-out-port "{\n")
(format impl-out-port " enum state last_accept_state = INVALID_STATE;\n")
(format impl-out-port " const char *last_accept_pos = NULL;\n")
(format impl-out-port " enum state s = START_STATE;\n")
(format impl-out-port " const char *curr = *input;\n")
(format impl-out-port "\n")
(format impl-out-port " while (s != INVALID_STATE) {\n")
(format impl-out-port " s = trans[(size_t)*curr][s];\n")
(format impl-out-port " if (accept_state[s] > 0) {\n")
(format impl-out-port " last_accept_state = s;\n")
(format impl-out-port " last_accept_pos = curr;\n")
(format impl-out-port " }\n")
(format impl-out-port " curr++;\n")
(format impl-out-port " }\n")
(format impl-out-port "\n")
(format impl-out-port " if (last_accept_state == INVALID_STATE) {\n")
(format impl-out-port " return false;\n")
(format impl-out-port " }\n")
(format impl-out-port "\n")
(format impl-out-port " *token = accept_state[last_accept_state];\n")
(format impl-out-port " last_accept_pos++;\n")
(format impl-out-port " *str = *input; *num = last_accept_pos - *input;\n")
(format impl-out-port " *input = last_accept_pos + strspn(last_accept_pos, \" \\t\\n\\r\");\n")
(format impl-out-port "\n")
(format impl-out-port " char lextext[256];\n")
(format impl-out-port " strncpy(lextext, *str, *num);\n")
(format impl-out-port " lextext[*num] = '\\0';\n")
(format impl-out-port "\n")
(if (not (null? after-accept-code))
(begin
(format impl-out-port " switch (last_accept_state) {\n")
(map (lambda (p)
(let1 S (filter (lambda (s) (eq? (get-tokentags A s) (car p)))
(accept-states A))
(if (not (null? S))
(begin (map (lambda (s) (format impl-out-port " case ~A:\n" s)) S)
(map (lambda (l) (format impl-out-port " ~A\n" l))
(cdr p))
(format impl-out-port " break;\n")))))
after-accept-code)
(format impl-out-port " default:\n")
(format impl-out-port " break;\n")
(format impl-out-port " }\n")
(format impl-out-port "\n")))
(format impl-out-port " return true;\n")
(format impl-out-port "}\n"))
;; }}}
;; {{{ reduce-unused-tokentag
(define (reduce-unused-tokentag A tokentag-priority)
(define (get-highest-priority-tokentag tags)
(any (lambda (pt) (find (lambda (t) (eq? pt t)) tags))
tokentag-priority))
(new-automaton
(start A) (accept-states A) (states A) (chars A) (trans A)
(map (lambda (ttpair)
(new-tokentag (tokentag-state ttpair)
(get-highest-priority-tokentag
(tokentag-tags ttpair))))
(tokentag A))))
;; }}}
;; {{{ lexgen
(define (lexgen header-out-port impl-out-port l optional-token after-accept-code)
(let ((regex-list
(cons 'or
(map (lambda (x)
(cond ((string? x)
(list 'token x (string->symbol x)))
((list? x)
(cons 'token x))))
l)))
(priority-list
(map (lambda (x)
(cond ((string? x) (string->symbol x))
((list? x) (last x))))
l))
(tokentag-string
(map (lambda (x)
(cond ((string? x) (cons (string->symbol x)
(string-upcase x)))
((list? x) (cons (last x)
(string-upcase (symbol->string (last x)))))))
l)))
(print-C-code header-out-port impl-out-port
(reduce-unused-tokentag
(minimize-DFA (NFA->DFA (construct-automaton regex-list)))
priority-list)
optional-token
tokentag-string after-accept-code)))
;; }}}
(lexgen (open-output-file "lex.h") (open-output-file "lex.c")
'("add" "addi" "addiu" "addu" "and" "andi" "beq" "bne" "j" "jal" "jr"
"lbu" "lhu" "lui" "lw" "nor" "or" "ori" "slt" "slti" "sltiu" "sltu"
"sll" "srl" "sb" "sh" "sw" "sub" "subu"
("," comma) ("(" paren_open) (")" paren_close)
("$zero" reg_zero) ("$at" reg_at)
("$v0" reg_v0) ("$v1" reg_v1)
("$a0" reg_a0) ("$a1" reg_a1) ("$a2" reg_a2) ("$a3" reg_a3)
("$t0" reg_t0) ("$t1" reg_t1) ("$t2" reg_t2) ("$t3" reg_t3)
("$t4" reg_t4) ("$t5" reg_t5) ("$t6" reg_t6) ("$t7" reg_t7)
("$s0" reg_s0) ("$s1" reg_s1) ("$s2" reg_s2) ("$s3" reg_s3)
("$s4" reg_s4) ("$s5" reg_s5) ("$s6" reg_s6) ("$s7" reg_s7)
("$t8" reg_t8) ("$t9" reg_t9)
("$k0" reg_k0) ("$k1" reg_k1)
("$gp" reg_gp) ("$sp" reg_sp) ("$fp" reg_fp) ("$ra" reg_ra)
((+ digit) num)
((+ any) label_from)
((and (+ any) ":") label_dest)
)
'("REG")
'((num "*num = strtod(lextext, NULL);")
(reg_zero "*token = REG; *num = 0;")
(reg_at "*token = REG; *num = 1;")
(reg_v0 "*token = REG; *num = 2;")
(reg_v1 "*token = REG; *num = 3;")
(reg_a0 "*token = REG; *num = 4;")
(reg_a1 "*token = REG; *num = 5;")
(reg_a2 "*token = REG; *num = 6;")
(reg_a3 "*token = REG; *num = 7;")
(reg_t0 "*token = REG; *num = 8;")
(reg_t1 "*token = REG; *num = 9;")
(reg_t2 "*token = REG; *num = 10;")
(reg_t3 "*token = REG; *num = 11;")
(reg_t4 "*token = REG; *num = 12;")
(reg_t5 "*token = REG; *num = 13;")
(reg_t6 "*token = REG; *num = 14;")
(reg_t7 "*token = REG; *num = 15;")
(reg_s0 "*token = REG; *num = 16;")
(reg_s1 "*token = REG; *num = 17;")
(reg_s2 "*token = REG; *num = 18;")
(reg_s3 "*token = REG; *num = 19;")
(reg_s4 "*token = REG; *num = 20;")
(reg_s5 "*token = REG; *num = 21;")
(reg_s6 "*token = REG; *num = 22;")
(reg_s7 "*token = REG; *num = 23;")
(reg_t8 "*token = REG; *num = 24;")
(reg_t9 "*token = REG; *num = 25;")
(reg_k0 "*token = REG; *num = 26;")
(reg_k1 "*token = REG; *num = 27;")
(reg_gp "*token = REG; *num = 28;")
(reg_sp "*token = REG; *num = 29;")
(reg_fp "*token = REG; *num = 30;")
(reg_ra "*token = REG; *num = 31;"))
)
@draftcode
Copy link
Author

初期状態が受理状態であるような正規言語に対して, 正しい動作をするコードを吐かない気がする.
が, そんなにまじめなコードでもないから対応しない.

@draftcode
Copy link
Author

  • 数字の認識がイマイチ
  • 空列を認識するような正規言語を受理するようなコードを吐かない. (lexerとしては問題ない気がする).
  • any とか digitを使うと劇的に重くなる

@draftcode
Copy link
Author

  • 数字の認識がイマイチ
  • 空列を認識するような正規言語を受理するようなコードを吐かない. (lexerとしては問題ない気がする).
  • any とか digitを使うと劇的に重くなる

@draftcode
Copy link
Author

  • minimizeで到達できない状態を削除するようにする.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment