Last active
July 15, 2021 23:09
-
-
Save zesterer/d2efc9663d59af1ebe3c73d0bd6c42b8 to your computer and use it in GitHub Desktop.
Lexer for a pythonic indentation-aware language that converts indentation to token blocks *and* correctly handles parentheses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// A lexed token: either an atom or a delimited group of tokens.
///
/// `Clone`/`PartialEq`/`Eq` are derived so tokens can be duplicated and
/// compared directly (e.g. in tests) instead of via their `Debug` output.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
    /// An integer literal.
    Num(i64),
    /// The tokens enclosed in a `(` ... `)` pair.
    Paren(Vec<Token>),
    /// A run of lines grouped under a common, deeper indentation level.
    Block(Vec<Token>),
}
fn lex(src: &str) -> Result<Vec<Token>, String> { | |
let mut ident_depth = 0; | |
let mut token_stack = vec![(ident_depth, Vec::new())]; | |
enum State { | |
Num, | |
} | |
let mut state = State::Num; | |
let mut symbol = String::new(); | |
let mut paren_depth = 0; | |
let mut chars = src.chars().peekable(); | |
while let Some(c) = chars.next() { | |
match c { | |
c if c.is_ascii_digit() => match &state { | |
State::Num => symbol.push(c), | |
}, | |
c => { | |
if symbol.len() > 0 { | |
match &state { | |
State::Num => if paren_depth == 0 { | |
token_stack.push((ident_depth, vec![Token::Num(symbol.parse().unwrap())])) | |
} else { | |
token_stack.last_mut().unwrap().1.push(Token::Num(symbol.parse().unwrap())) | |
}, | |
} | |
symbol.clear(); | |
} | |
match c { | |
'(' => { token_stack.push((ident_depth, Vec::new())); paren_depth += 1; }, | |
')' => { | |
let paren = Token::Paren(token_stack.pop().unwrap().1); | |
token_stack.last_mut().unwrap().1.push(paren); | |
paren_depth -= 1; | |
}, | |
'\n' if paren_depth == 0 => { | |
ident_depth = 0; | |
while let Some(_) = chars.peek().filter(|c| c.is_whitespace()) { | |
ident_depth += 1; | |
chars.next(); | |
} | |
token_stack.push((ident_depth, Vec::new())); | |
}, | |
c if c.is_whitespace() => {}, | |
c => return Err(format!("Unknown character '{}'", c)) | |
} | |
}, | |
} | |
} | |
//dbg!(&token_stack); | |
let mut tokens = vec![(0, Vec::new())]; | |
let mut current_depth = 0; | |
for (depth, toks) in token_stack { | |
if depth > current_depth { | |
tokens.push((depth, Vec::new())); | |
} else if depth < current_depth { | |
while tokens.last().unwrap().0 > depth { | |
let toks = tokens.pop().unwrap().1; | |
tokens.last_mut().unwrap().1.push(Token::Block(toks)); | |
} | |
} | |
current_depth = depth; | |
for tok in toks { | |
tokens.last_mut().unwrap().1.push(tok); | |
} | |
} | |
//dbg!(&tokens); | |
while tokens.len() > 1 { | |
let toks = tokens.pop().unwrap().1; | |
tokens.last_mut().unwrap().1.push(Token::Block(toks)); | |
} | |
//dbg!(&tokens); | |
Ok(tokens.pop().unwrap().1) | |
} | |
// Demo input for `lex`. Indentation is semantically significant inside this
// literal: deeper lines become nested `Token::Block`s, while the multi-line
// `(...)` group must remain a single `Token::Paren` because newlines inside
// parentheses are ignored.
// NOTE(review): leading whitespace in this literal was mangled by extraction;
// the indentation below is reconstructed — verify against the original gist.
const SRC: &str = r#"
123 456
    41 42
        43
    (44
        45
    46
)
    47
48 49
"#;
/// Entry point: lexes the embedded demo source and dumps the resulting token
/// tree (or lex error) to stderr via `dbg!`.
fn main() {
    dbg!(lex(SRC));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment