@iwillspeak
Last active December 11, 2020 11:50
A Microcrate containing an example hand-written lexer.
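
The gist is made up of three files: an old-style Cargo.lock, a Cargo.toml declaring a single binary target, and main.rs containing the lexer itself. The lexer walks a small hand-written state machine over its input, recognising identifiers, integer literals and '+' while skipping whitespace. As a rough sketch of the intended use, with the API and expected tokens taken from the code and tests below:

let tokens = Tokeniser::new_from_str("some + 123").matches();
// yields [Var("some"), Plus, Num(123)]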

# Cargo.lock
[root]
name = "lexer_example"
version = "0.0.1"

# Cargo.toml
[package]
name = "lexer_example"
version = "0.0.1"
authors = ["Will Speak <will@willspeak.me>"]

[[bin]]
name = "lexer_example"
path = "main.rs"

// main.rs
use std::io;
use std::io::prelude::*;

#[derive(Debug, PartialEq)]
pub enum Tok {
    Var(String),
    Num(i32),
    Plus
}
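
// States of the hand-written state machine below: `S` is the start state,
// `A` accepts runs of whitespace, `B` accepts '+', `C` accepts integer
// literals and `D` accepts identifiers.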
#[derive(PartialEq)]
enum State {
    S, A, B, C, D
}

pub struct Tokeniser {
    ts: usize,
    chars: String
}

impl Tokeniser {
    pub fn new_from_str(string: &str) -> Tokeniser {
        let bytes: Vec<_> = string.bytes().collect();
        let mut temp_reader = io::Cursor::new(bytes);
        Tokeniser::new(&mut temp_reader)
    }

    pub fn new(reader: &mut dyn Read) -> Tokeniser {
        let mut buf = String::new();
        reader.read_to_string(&mut buf).unwrap();
        Tokeniser {
            ts: 0,
            chars: buf
        }
    }
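
    // Runs the state machine from the current position using "maximal munch":
    // keep consuming characters while a transition exists, then decide which
    // token (if any) the final state corresponds to. Whitespace matches are
    // skipped and the search continues with the following characters.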
    fn next_match(&mut self) -> Option<Tok> {
        loop {
            let mut state = State::S;
            let mut te = self.ts;
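            // NOTE: `ts` and `te` are byte offsets into `chars`; advancing
            // `te` by one per character only works because every character
            // this lexer accepts is a single ASCII byte.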
            for c in self.chars[self.ts..].chars() {
                // find the next transition in the state machine
                let next = match state {
                    State::S => match c {
                        ' ' | '\t' => Some(State::A),
                        '+' => Some(State::B),
                        '0'..='9' => Some(State::C),
                        'a'..='z' => Some(State::D),
                        _ => None
                    },
                    State::A => match c {
                        ' ' | '\t' => Some(State::A),
                        _ => None
                    },
                    State::B => None,
                    State::C => match c {
                        '0'..='9' => Some(State::C),
                        _ => None
                    },
                    State::D => match c {
                        'a'..='z' | '0'..='9' => Some(State::D),
                        _ => None
                    }
                };
                // If we found a transition then consume the character
                // and move to that state
                if let Some(next_state) = next {
                    state = next_state;
                    te += 1;
                } else {
                    break;
                }
            }
            // once we can no longer match any more characters we
            // decide what token to return
            let token_str = &self.chars[self.ts..te];
            self.ts = te;
            // If we recognised some whitespace, look for the next
            // token instead
            if state == State::A {
                continue;
            }
            // Depending on which state we're in we know which token
            // we have just accepted
            return match state {
                State::B => Some(Tok::Plus),
                State::C => Some(Tok::Num(token_str.parse().unwrap())),
                State::D => Some(Tok::Var(token_str.to_string())),
                _ => None
            };
        }
    }

    pub fn matches(&mut self) -> Vec<Tok> {
        let mut v = vec![];
        while let Some(tok) = self.next_match() {
            v.push(tok);
        }
        v
    }
}
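
// A possible extension, not part of the original gist: expose the tokens
// lazily as an `Iterator` instead of collecting them eagerly with `matches`.
// With this in place a `for` loop over a `Tokeniser` yields each `Tok` in turn.
impl Iterator for Tokeniser {
    type Item = Tok;

    fn next(&mut self) -> Option<Tok> {
        self.next_match()
    }
}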

#[cfg(not(test))]
fn dump_tokens(s: &str) {
    println!("Tokenising: '{}'", s);
    for tok in Tokeniser::new_from_str(s).matches() {
        println!("{0:?}", tok);
    }
}

#[cfg(not(test))]
fn main() {
    dump_tokens("some tokens + 123 things");
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        dump_tokens(line.unwrap().as_ref());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn var_matches_returns_var() {
        let matched = Tokeniser::new_from_str("some vars").matches();
        assert_eq!(Tok::Var("some".to_string()), matched[0]);
        assert_eq!(Tok::Var("vars".to_string()), matched[1]);
    }

    #[test]
    fn num_matches_returns_num() {
        let matched = Tokeniser::new_from_str("13337 1701 123").matches();
        assert_eq!(Tok::Num(13337), matched[0]);
        assert_eq!(Tok::Num(1701), matched[1]);
        assert_eq!(Tok::Num(123), matched[2]);
    }

    #[test]
    fn mixed_string_returns_tokens() {
        let matched = Tokeniser::new_from_str("some + var + 1337").matches();
        assert_eq!(Tok::Var("some".to_string()), matched[0]);
        assert_eq!(Tok::Plus, matched[1]);
        assert_eq!(Tok::Var("var".to_string()), matched[2]);
        assert_eq!(Tok::Plus, matched[3]);
        assert_eq!(Tok::Num(1337), matched[4]);
    }

    #[test]
    fn without_spaces_all_identifiers_are_returned() {
        let matched = Tokeniser::new_from_str("123v123").matches();
        assert_eq!(Tok::Num(123), matched[0]);
        assert_eq!(Tok::Var("v123".to_string()), matched[1]);
    }
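
    // A hypothetical extra check, not in the original gist: a lone '+' is
    // recognised as Tok::Plus.
    #[test]
    fn plus_matches_returns_plus() {
        let matched = Tokeniser::new_from_str("+").matches();
        assert_eq!(Tok::Plus, matched[0]);
    }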
}