Created
November 14, 2024 05:45
-
-
Save jmsdnns/2bb7b43bfe0e44a076457a1123ca5e00 to your computer and use it in GitHub Desktop.
A simple example of an EBNF parser written in Rust, without support from external crates.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// <expression ::= <term> { ("+" | "-") <term> } | |
// <term> ::= <factor> { ("*" | "/") <factor> } | |
// <factor> ::= <number> | "(" <expression> ")" | |
// <number> ::= <digit> { <digit> } | |
// <digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | |
use std::str::FromStr; | |
#[derive(Debug, PartialEq, Clone)] | |
pub enum Token { | |
Number(i64), | |
Plus, | |
Minus, | |
Star, | |
Slash, | |
LParen, | |
RParen, | |
EOF, | |
} | |
pub struct Lexer<'a> { | |
input: &'a str, | |
pos: usize, | |
} | |
impl<'a> Lexer<'a> { | |
pub fn new(input: &'a str) -> Self { | |
Lexer { input, pos: 0 } | |
} | |
pub fn next_token(&mut self) -> Token { | |
self.skip_whitespace(); | |
if self.pos >= self.input.len() { | |
return Token::EOF; | |
} | |
let c = self.input[self.pos..].chars().nth(0).unwrap(); | |
match c { | |
'0'..='9' => self.number(), | |
'+' => { | |
self.pos += 1; | |
Token::Plus | |
} | |
'-' => { | |
self.pos += 1; | |
Token::Minus | |
} | |
'*' => { | |
self.pos += 1; | |
Token::Star | |
} | |
'/' => { | |
self.pos += 1; | |
Token::Slash | |
} | |
'(' => { | |
self.pos += 1; | |
Token::LParen | |
} | |
')' => { | |
self.pos += 1; | |
Token::RParen | |
} | |
_ => panic!("Whaaat is this? {}", c), | |
} | |
} | |
pub fn number(&mut self) -> Token { | |
let start = self.pos; | |
while self.pos < self.input.len() | |
&& self.input[self.pos..] | |
.chars() | |
.nth(0) | |
.unwrap() | |
.is_ascii_digit() | |
{ | |
self.pos += 1; | |
} | |
let num_str = &self.input[start..self.pos]; // consumes multiple digits | |
let number = i64::from_str(num_str).unwrap(); | |
Token::Number(number) | |
} | |
pub fn skip_whitespace(&mut self) { | |
while self.pos < self.input.len() | |
&& self.input[self.pos..] | |
.chars() | |
.nth(0) | |
.unwrap() | |
.is_whitespace() | |
{ | |
self.pos += 1; | |
} | |
} | |
} | |
#[derive(Debug, PartialEq, Clone)] | |
pub enum Expr { | |
Number(i64), | |
BinaryOp(Box<Expr>, Operator, Box<Expr>), | |
} | |
#[derive(Debug, PartialEq, Clone)] | |
pub enum Operator { | |
Plus, | |
Minus, | |
Star, | |
Slash, | |
} | |
pub struct Parser<'a> { | |
lexer: Lexer<'a>, | |
current: Token, | |
} | |
impl<'a> Parser<'a> { | |
pub fn new(input: &'a str) -> Self { | |
let mut lexer = Lexer::new(input); | |
let current = lexer.next_token(); | |
Parser { lexer, current } | |
} | |
fn eat(&mut self, token: Token) { | |
if self.current == token { | |
self.current = self.lexer.next_token(); | |
} else { | |
panic!("Huh? Got {:?} but expected {:?}", token, self.current); | |
} | |
} | |
pub fn parse(&mut self) -> Expr { | |
self.expression() | |
} | |
fn expression(&mut self) -> Expr { | |
let mut left = self.term(); | |
while self.current == Token::Plus || self.current == Token::Minus { | |
let op = match self.current { | |
Token::Plus => Operator::Plus, | |
Token::Minus => Operator::Minus, | |
_ => unreachable!(), | |
}; | |
self.eat(self.current.clone()); | |
let right = self.term(); | |
left = Expr::BinaryOp(Box::new(left), op, Box::new(right)); | |
} | |
left | |
} | |
fn term(&mut self) -> Expr { | |
let mut left = self.factor(); | |
while self.current == Token::Star || self.current == Token::Slash { | |
let op = match self.current { | |
Token::Star => Operator::Star, | |
Token::Slash => Operator::Slash, | |
_ => unreachable!(), | |
}; | |
self.eat(self.current.clone()); | |
let right = self.factor(); | |
left = Expr::BinaryOp(Box::new(left), op, Box::new(right)); | |
} | |
left | |
} | |
fn factor(&mut self) -> Expr { | |
match self.current { | |
Token::Number(n) => { | |
self.eat(Token::Number(n)); | |
Expr::Number(n) | |
} | |
Token::LParen => { | |
self.eat(Token::LParen); | |
let expr = self.expression(); | |
self.eat(Token::RParen); | |
expr | |
} | |
_ => panic!("Unexpected token in factor {:?}", self.current), | |
} | |
} | |
} | |
fn main() { | |
let expr = "3 + 5 * (2 - 8)"; | |
let mut parser = Parser::new(expr); | |
let ast = parser.parse(); | |
println!("{:?}", ast); | |
} | |
#[cfg(test)] | |
mod lexer { | |
use super::*; | |
#[test] | |
fn test_lexer() { | |
let expr = "3 + 5 * (2 - 8)"; | |
let mut l = Lexer::new(expr); | |
assert_eq!(Token::Number(3), l.next_token()); | |
assert_eq!(Token::Plus, l.next_token()); | |
assert_eq!(Token::Number(5), l.next_token()); | |
assert_eq!(Token::Star, l.next_token()); | |
assert_eq!(Token::LParen, l.next_token()); | |
assert_eq!(Token::Number(2), l.next_token()); | |
assert_eq!(Token::Minus, l.next_token()); | |
assert_eq!(Token::Number(8), l.next_token()); | |
assert_eq!(Token::RParen, l.next_token()); | |
} | |
#[test] | |
fn test_number() { | |
let expr = "33"; | |
let mut l = Lexer::new(expr); | |
assert_eq!(Token::Number(33), l.next_token()); | |
} | |
} | |
#[cfg(test)] | |
mod parser { | |
use super::*; | |
#[test] | |
fn test_parser() { | |
use Expr::*; | |
use Operator::*; | |
let expr = "3 + 5 * (2 - 8)"; | |
let mut p = Parser::new(expr); | |
let ast = p.parse(); | |
let expected = BinaryOp( | |
Box::new(Number(3)), | |
Plus, | |
Box::new(BinaryOp( | |
Box::new(Number(5)), | |
Star, | |
Box::new(BinaryOp(Box::new(Number(2)), Minus, Box::new(Number(8)))), | |
)), | |
); | |
assert_eq!(expected, ast); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment