Last active
July 15, 2021 23:09
-
-
Save zesterer/d2efc9663d59af1ebe3c73d0bd6c42b8 to your computer and use it in GitHub Desktop.
Lexer for a pythonic indentation-aware language that converts indentation to token blocks *and* correctly handles parentheses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// A lexed token: either an atom or a delimited group of tokens.
///
/// `Clone`/`PartialEq`/`Eq` are derived so tokens can be duplicated and
/// compared directly (e.g. in tests) instead of via their `Debug` output.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
    /// An integer literal.
    Num(i64),
    /// The tokens enclosed in a `(` ... `)` pair.
    Paren(Vec<Token>),
    /// A run of lines grouped under a common, deeper indentation level.
    Block(Vec<Token>),
}
fn lex(src: &str) -> Result<Vec<Token>, String> { | |
let mut ident_depth = 0; | |
let mut token_stack = vec![(ident_depth, Vec::new())]; | |
enum State { | |
Num, | |
} | |
let mut state = State::Num; | |
let mut symbol = String::new(); | |
let mut paren_depth = 0; | |
let mut chars = src.chars().peekable(); | |
while let Some(c) = chars.next() { | |
match c { | |
c if c.is_ascii_digit() => match &state { | |
State::Num => symbol.push(c), | |
}, | |
c => { | |
if symbol.len() > 0 { | |
match &state { | |
State::Num => if paren_depth == 0 { | |
token_stack.push((ident_depth, vec![Token::Num(symbol.parse().unwrap())])) | |
} else { | |
token_stack.last_mut().unwrap().1.push(Token::Num(symbol.parse().unwrap())) | |
}, | |
} | |
symbol.clear(); | |
} | |
match c { | |
'(' => { token_stack.push((ident_depth, Vec::new())); paren_depth += 1; }, | |
')' => { | |
let paren = Token::Paren(token_stack.pop().unwrap().1); | |
token_stack.last_mut().unwrap().1.push(paren); | |
paren_depth -= 1; | |
}, | |
'\n' if paren_depth == 0 => { | |
ident_depth = 0; | |
while let Some(_) = chars.peek().filter(|c| c.is_whitespace()) { | |
ident_depth += 1; | |
chars.next(); | |
} | |
token_stack.push((ident_depth, Vec::new())); | |
}, | |
c if c.is_whitespace() => {}, | |
c => return Err(format!("Unknown character '{}'", c)) | |
} | |
}, | |
} | |
} | |
//dbg!(&token_stack); | |
let mut tokens = vec![(0, Vec::new())]; | |
let mut current_depth = 0; | |
for (depth, toks) in token_stack { | |
if depth > current_depth { | |
tokens.push((depth, Vec::new())); | |
} else if depth < current_depth { | |
while tokens.last().unwrap().0 > depth { | |
let toks = tokens.pop().unwrap().1; | |
tokens.last_mut().unwrap().1.push(Token::Block(toks)); | |
} | |
} | |
current_depth = depth; | |
for tok in toks { | |
tokens.last_mut().unwrap().1.push(tok); | |
} | |
} | |
//dbg!(&tokens); | |
while tokens.len() > 1 { | |
let toks = tokens.pop().unwrap().1; | |
tokens.last_mut().unwrap().1.push(Token::Block(toks)); | |
} | |
//dbg!(&tokens); | |
Ok(tokens.pop().unwrap().1) | |
} | |
// Demo input for `lex`. Indentation is semantically significant inside this
// literal: deeper lines become nested `Token::Block`s, while the multi-line
// `(...)` group must remain a single `Token::Paren` because newlines inside
// parentheses are ignored.
// NOTE(review): leading whitespace in this literal was mangled by extraction;
// the indentation below is reconstructed — verify against the original gist.
const SRC: &str = r#"
123 456
    41 42
        43
    (44
        45
    46
)
    47
48 49
"#;
/// Entry point: lexes the embedded demo source and dumps the resulting token
/// tree (or lex error) to stderr via `dbg!`.
fn main() {
    dbg!(lex(SRC));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment