Last active
September 21, 2017 08:17
-
-
Save yangyuqian/d3c1c74e5773fe1844147d073367a12b to your computer and use it in GitHub Desktop.
lexical parser example to parse and verify a expression `<int64> + <int64>`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
A simple lexical parser for mathematical expression, | |
i.e. | |
1 + 2 => {1, 2, "+"} | |
1+2 => {1, 2, "+"} | |
1 => error | |
1 + => error | |
1 + A => error | |
a A C => error | |
To make it simple only "+" is supported as a operator, | |
only integer is supported. | |
*/ | |
package main | |
import ( | |
"fmt" | |
"log" | |
"os" | |
"reflect" | |
"strconv" | |
"strings" | |
"sync" | |
"unicode/utf8" | |
) | |
type ( | |
tokenType int | |
Pos int | |
) | |
const ( | |
ILLEGAL tokenType = iota | |
END | |
NUMBER | |
OPERATOR | |
) | |
type token struct { | |
typ tokenType | |
pos Pos | |
text string | |
} | |
const ( | |
EOF rune = -1 | |
_exprRunes = "0123456789+" | |
_numberRunes = "0123456789" | |
_opRunes = "+" | |
_ignoredRunes = " " | |
_endRune = ';' | |
) | |
type ( | |
lexFn func(*lexer) lexFn | |
Operator string | |
Number int64 | |
) | |
type lexer struct { | |
start Pos | |
pos Pos | |
lastPos Pos | |
width Pos | |
input string | |
state lexFn | |
tokens chan token | |
} | |
func (l *lexer) peek() (r rune) { | |
r = l.next() | |
l.backup() | |
return | |
} | |
func (l *lexer) backup() { | |
l.pos -= l.width | |
} | |
func (l *lexer) next() (r rune) { | |
if int(l.pos) >= len(l.input) { | |
return EOF | |
} | |
r, w := utf8.DecodeRuneInString(l.input[l.pos:]) | |
l.width = Pos(w) | |
l.pos += l.width | |
return | |
} | |
func (l *lexer) ignore() { | |
l.start = l.pos | |
} | |
func (l *lexer) forget() { | |
l.pos = l.start | |
} | |
func (l *lexer) emit(t tokenType) { | |
l.tokens <- token{t, l.start, l.input[l.start:l.pos]} | |
l.start = l.pos | |
} | |
func (l *lexer) accept(valid string) (v bool) { | |
if strings.ContainsRune(valid, l.next()) { | |
return true | |
} | |
l.backup() | |
return false | |
} | |
func (l *lexer) acceptRun(valid string) { | |
for strings.ContainsRune(valid, l.next()) { | |
} | |
l.backup() | |
} | |
func (l *lexer) errorf(format string, args ...interface{}) lexFn { | |
l.tokens <- token{ILLEGAL, l.start, fmt.Sprintf(format, args...)} | |
return nil | |
} | |
func (l *lexer) shutdown() { | |
close(l.tokens) | |
} | |
func (l *lexer) tokenChan() <-chan token { | |
return l.tokens | |
} | |
func (l *lexer) run() { | |
for l.state = lexText; l.state != nil; { | |
l.state = l.state(l) | |
} | |
l.shutdown() | |
} | |
func newLexer(exp string) (l *lexer) { | |
if !strings.HasSuffix(exp, ";") { | |
exp = exp + ";" | |
} | |
return &lexer{ | |
start: Pos(0), | |
pos: Pos(0), | |
input: exp, | |
tokens: make(chan token, 3), | |
} | |
} | |
func lexText(l *lexer) (fn lexFn) { | |
// ignore meaningless runes, such as whitespaces | |
l.acceptRun(_ignoredRunes) | |
l.ignore() | |
l.acceptRun(_numberRunes) | |
if isNumber(l.input[l.start:l.pos]) { | |
l.forget() | |
return lexNumber | |
} | |
l.acceptRun(_opRunes) | |
if isOp(l.input[l.start:l.pos]) { | |
l.forget() | |
return lexOp | |
} | |
if l.next() == _endRune { | |
return nil | |
} else { | |
l.forget() | |
} | |
return l.errorf("Illegal expression `%s`, start:pos => %d:%d", l.input[l.start:], l.start, l.pos) | |
} | |
func lexNumber(l *lexer) (fn lexFn) { | |
l.acceptRun(_numberRunes) | |
word := l.input[l.start:l.pos] | |
if isNumber(word) { | |
l.emit(NUMBER) | |
} else { | |
// do nothing if word is not a valid number | |
l.forget() | |
} | |
return lexText | |
} | |
func lexOp(l *lexer) (fn lexFn) { | |
l.acceptRun(_opRunes) | |
word := l.input[l.start:l.pos] | |
if isOp(word) { | |
l.emit(OPERATOR) | |
} else { | |
// do nothing if not a valid operator | |
l.forget() | |
} | |
return lexText | |
} | |
func isNumber(text string) bool { | |
_, err := strconv.ParseInt(text, 10, 64) | |
if err != nil { | |
return false | |
} | |
return true | |
} | |
func isOp(text string) bool { | |
switch text { | |
case "+": | |
return true | |
} | |
return false | |
} | |
type expr struct { | |
left Number | |
right Number | |
op Operator | |
} | |
func (e *expr) String() string { | |
return fmt.Sprintf("Left(%d) %s Right(%d)", e.left, e.op, e.right) | |
} | |
type parseFn func(*parser) parseFn | |
type parser struct { | |
*lexer | |
state parseFn | |
buf []token | |
bufSize int | |
expr *expr | |
} | |
func (p *parser) run() { | |
for p.state = parseExpr; p.state != nil; { | |
p.state = p.state(p) | |
} | |
} | |
func (p *parser) fill() { | |
n := p.bufSize - len(p.buf) | |
for i := 0; i < n; i++ { | |
p.buf = append(p.buf, <-p.tokenChan()) | |
} | |
} | |
func (p *parser) peek() (t token) { | |
p.fill() | |
if len(p.buf) > 0 { | |
t = p.buf[0] | |
} | |
return | |
} | |
// buf is an array acts as a FIFO queue | |
func (p *parser) next() (t token) { | |
p.fill() | |
if len(p.buf) > 0 { | |
t = p.buf[0] | |
p.buf = p.buf[1:] | |
} | |
p.fill() | |
return | |
} | |
func newParser(l *lexer) *parser { | |
return &parser{ | |
lexer: l, | |
bufSize: 3, | |
} | |
} | |
func parseExpr(p *parser) parseFn { | |
t := p.peek() | |
if emptyToken(t) { | |
return nil | |
} | |
switch t.typ { | |
case NUMBER: | |
return parseNumber | |
case OPERATOR: | |
return parseOp | |
} | |
log.Fatalf("wrong format: %s", p.lexer.input) | |
return nil | |
} | |
func parseNumber(p *parser) parseFn { | |
t := p.next() | |
if emptyToken(t) { | |
log.Fatalf("bad format: %+v", t) | |
return nil | |
} | |
n, err := strconv.ParseInt(t.text, 10, 64) | |
if err != nil { | |
panic(err) | |
} | |
nextToken := p.peek() | |
if emptyToken(nextToken) { | |
if p.expr == nil { | |
log.Fatalf("right number must be followed by EOF") | |
} | |
p.expr.right = Number(n) | |
return nil | |
return nil | |
} | |
switch nextToken.typ { | |
case OPERATOR: | |
if p.expr != nil { | |
panic("wrong format") | |
} | |
p.expr = &expr{left: Number(n)} | |
return parseOp | |
default: | |
log.Fatalf("number must be followed by operator or END: %+v", nextToken) | |
} | |
return parseExpr | |
} | |
func parseOp(p *parser) parseFn { | |
t := p.next() | |
if emptyToken(t) { | |
return nil | |
} | |
if t.typ != OPERATOR { | |
log.Fatalf("wrong operator token") | |
} | |
if p.expr == nil { | |
log.Fatalf("wrong format") | |
} | |
if p.expr.op != "" { | |
log.Fatalf("operator exist, wrong format") | |
} | |
p.expr.op = Operator(t.text) | |
nextToken := p.peek() | |
if emptyToken(nextToken) { | |
return nil | |
} | |
// operator must be followed by a number | |
if nextToken.typ == NUMBER { | |
if p.expr == nil { | |
log.Fatalf("wrong format") | |
} | |
return parseNumber | |
} else { | |
log.Fatalf("an operator must be followed by a number: %+v", nextToken) | |
} | |
return parseExpr | |
} | |
func main() { | |
if len(os.Args) < 2 { | |
log.Fatalf("input a A + B to see the parsed result") | |
} | |
wg := sync.WaitGroup{} | |
l := newLexer(os.Args[1]) | |
p := newParser(l) | |
wg.Add(1) | |
go func() { | |
p.run() | |
wg.Done() | |
}() | |
go l.run() | |
wg.Wait() | |
log.Printf("expression: %+v", p.expr) | |
} | |
func emptyToken(t token) bool { | |
return reflect.DeepEqual(reflect.ValueOf(t).Interface(), reflect.Zero(reflect.TypeOf(t)).Interface()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment