Skip to content

Instantly share code, notes, and snippets.

@mk2
Last active January 5, 2018 13:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mk2/377346c4e24c7a5cccde to your computer and use it in GitHub Desktop.
Save mk2/377346c4e24c7a5cccde to your computer and use it in GitHub Desktop.
//! Tokenizer module.
//!
//! Splits source text into a stream of `Token` values.
use super::token::{Token, Kind, Place, LineNum, Pos, Length};
use std::str::Chars;
/// Tokenizer state over a borrowed source string.
///
/// Build one with `Tokenizer::from_str` and pull tokens with `next_token`
/// (or through the `Iterator` implementation).
#[allow(dead_code)]
pub struct Tokenizer<'a> {
    /// The complete input text the tokenizer was created from.
    pub lines: &'a str,
    /// A single character handed back by a scanner that read one character
    /// too far; it is returned before anything else is read from `chars`.
    next_char: Option<char>,
    /// Character iterator over `lines`.
    chars: Chars<'a>,
    /// Line counter; its value is copied into the `Place` of each token.
    linen: LineNum,
    /// Position counter advanced while scanning; its value is copied into the
    /// `Place` of each token.
    pos: Pos,
    /// Initialised to a space by `from_str`; not otherwise used in this file.
    readchar: char,
    /// Scratch buffer holding the text of the token currently being scanned.
    readbuf: String,
}
#[allow(dead_code)]
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer that reads from the beginning of `raw_str`.
    ///
    /// The line and position counters both start at 0.
    pub fn from_str(raw_str: &'a str) -> Tokenizer<'a> {
        Tokenizer {
            lines: raw_str,
            next_char: None,
            chars: raw_str.chars(),
            readchar: ' ',
            readbuf: String::new(),
            linen: 0,
            pos: 0,
        }
    }

    /// Scans and returns the next token.
    ///
    /// Blanks are skipped, then the first significant character selects the
    /// scanner that consumes the rest of the token.
    ///
    /// # Errors
    ///
    /// * `"not found regular token"` when the input holds no further
    ///   significant character.
    /// * `"not valid string literal"` when a string literal is not closed
    ///   before the end of the input.
    pub fn next_token(&mut self) -> Result<Token, String> {
        self.readbuf.clear();
        let c = match self.next_significant_char() {
            Some(c) => c,
            None => return Err("not found regular token".to_string()),
        };
        self.readbuf.push(c);
        match c {
            '(' | ')' | '{' | '}' | '[' | ',' | '/' | '%' | '^' | ';' | '\n' | '\r' | '#' => {
                self.get_single_keyword_token()
            }
            // ':' must be routed here, otherwise ":" and ":=" are scanned as
            // identifiers and never reach the operator table.
            '.' | '-' | '+' | '*' | '=' | '<' | '>' | ']' | '!' | '?' | ':' => {
                self.get_plural_keyword_token()
            }
            d if d.is_digit(10) => self.get_number_literal(),
            '"' | '`' => self.get_string_literal(c),
            _ => self.get_ident_token(),
        }
    }

    /// Returns the first character that starts a token, or `None` at the end
    /// of the input.
    fn next_significant_char(&mut self) -> Option<char> {
        // `next_char` already accounts for the position of every character it
        // hands out, so nothing is added here.
        self.skip_whitespaces()
    }

    /// Consumes blanks and returns the first character that is not a blank.
    ///
    /// `'\n'` and `'\r'` are *not* skipped although `char::is_whitespace`
    /// reports them as whitespace: `next_token` turns them into `Kind::EOL`
    /// tokens.
    fn skip_whitespaces(&mut self) -> Option<char> {
        while let Some(c) = self.next_char() {
            if c == '\n' || c == '\r' || !c.is_whitespace() {
                return Some(c);
            }
        }
        None
    }

    /// Reads one character, preferring a pushed-back character over the
    /// underlying iterator, and advances `pos` by one for it.
    ///
    /// All scanners read through this method so that `pos` always equals the
    /// number of characters consumed so far.
    // NOTE(review): `pos` is not reset at line breaks, i.e. it is an offset
    // from the start of the input rather than a column. Confirm which of the
    // two `Place` expects.
    fn next_char(&mut self) -> Option<char> {
        let c = match self.next_char.take() {
            Some(pending) => Some(pending),
            None => self.chars.next(),
        };
        if c.is_some() {
            self.pos += 1;
        }
        c
    }

    /// Hands `c` back so that the next `next_char` call returns it again, and
    /// takes back the position advance made when it was read.
    ///
    /// Only one character can be pending at a time.
    fn unread(&mut self, c: char) {
        self.next_char = Some(c);
        self.pos -= 1;
    }

    /// Scans the remainder of an identifier or reserved word whose first
    /// character is already in `readbuf`.
    ///
    /// Reserved words produce a token of their own kind without a value;
    /// everything else becomes `Kind::Ident` carrying the scanned text.
    fn get_ident_token(&mut self) -> Result<Token, String> {
        let lin = self.linen;
        let pos = self.pos;
        while let Some(c) = self.next_char() {
            if !c.is_alphanumeric() {
                self.unread(c);
                break;
            }
            self.readbuf.push(c);
        }
        // Counted in characters, the same unit `pos` is counted in.
        let len = self.readbuf.chars().count() as Length;
        let kind = match self.readbuf.as_str() {
            "true" => Kind::True,
            "false" => Kind::False,
            "fun" => Kind::Fun,
            "if" => Kind::If,
            "else" => Kind::Else,
            "elseif" => Kind::ElseIf,
            "nil" => Kind::Nil,
            "do" => Kind::Do,
            "let" => Kind::Let,
            "in" => Kind::In,
            "repeat" => Kind::Repeat,
            "while" => Kind::While,
            "for" => Kind::For,
            "return" => Kind::Return,
            "break" => Kind::Break,
            "end" => Kind::End,
            "and" => Kind::And,
            "not" => Kind::Not,
            "or" => Kind::Or,
            _ => Kind::Ident,
        };
        let place = Place::with(lin, pos, len);
        if match_kind!(kind, Kind::Ident) {
            Ok(Token::with_kind_value(kind, place, self.readbuf.clone()))
        } else {
            Ok(Token::with_kind(kind, place))
        }
    }

    /// Builds the token for a single-character symbol already in `readbuf`.
    ///
    /// The line counter advances only for real line breaks (`"\n"`, `"\r"` or
    /// `"\r\n"`); `;` also yields `Kind::EOL` but stays on the same line.
    fn get_single_keyword_token(&mut self) -> Result<Token, String> {
        let lin = self.linen;
        let pos = self.pos;
        let mut len = 1;
        let kind = match self.readbuf.as_str() {
            "(" => Kind::Lpar,
            ")" => Kind::Rpar,
            "{" => Kind::Lbrac,
            "}" => Kind::Rbrac,
            "[" => Kind::Lsqua,
            "," => Kind::Comma,
            "^" => Kind::Pow,
            "%" => Kind::Remain,
            "/" => Kind::Divide,
            "#" => Kind::ArrLen,
            "\n" => Kind::EOL,
            "\r" => Kind::EOL,
            ";" => Kind::EOL,
            _ => Kind::Unknown,
        };
        let is_line_break = self.readbuf == "\n" || self.readbuf == "\r";
        if self.readbuf == "\r" {
            // Treat "\r\n" as one line break so it is neither reported nor
            // counted twice.
            if let Some(c) = self.next_char() {
                if c == '\n' {
                    len = 2;
                } else {
                    self.unread(c);
                }
            }
        }
        if is_line_break {
            self.linen += 1;
        }
        let place = Place {
            linen: lin,
            pos: pos,
            len: len,
        };
        if match_kind!(kind, Kind::Unknown) {
            Ok(Token::with_kind_value(kind, place, self.readbuf.clone()))
        } else {
            Ok(Token::with_kind(kind, place))
        }
    }

    /// Scans an operator that may span several characters; its first
    /// character is already in `readbuf`.
    ///
    /// Characters are taken only while the text read so far is still the
    /// beginning of a known operator, so an operator directly followed by
    /// other punctuation (for example `+(`) is no longer glued into one
    /// unknown token. Text that matches no operator yields `Kind::Unknown`
    /// carrying the scanned text.
    fn get_plural_keyword_token(&mut self) -> Result<Token, String> {
        // Every spelling recognised by the `match` below; keep both in sync.
        const OPERATORS: [&'static str; 18] = [
            "+", "<", "<=", ">", ">=", "==", "!=", ":", ":=", "?:=", ".", "..", "...", "--",
            "--[[", "]]", "--[[]]", "]",
        ];
        let lin = self.linen;
        let pos = self.pos;
        while let Some(c) = self.next_char() {
            self.readbuf.push(c);
            let still_prefix = {
                let seen = self.readbuf.as_str();
                OPERATORS.iter().any(|op| op.starts_with(seen))
            };
            if !still_prefix {
                // `c` belongs to the next token.
                self.readbuf.pop();
                self.unread(c);
                break;
            }
        }
        let len = self.readbuf.chars().count() as Length;
        let kind = match self.readbuf.as_str() {
            "+" => Kind::Plus,
            "<" => Kind::Less,
            "<=" => Kind::LessEqual,
            ">" => Kind::Greater,
            ">=" => Kind::GreaterEqual,
            "==" => Kind::Equal,
            "!=" => Kind::NotEqual,
            ":" => Kind::Dcolon,
            ":=" => Kind::Assign,
            "?:=" => Kind::ElvisAssign,
            "." => Kind::Sdot,
            ".." => Kind::Ddot,
            "..." => Kind::Tdot,
            "--" => Kind::LineComment,
            "--[[" => Kind::BlkCommentStart,
            "]]" => Kind::BlkCommentEnd,
            "--[[]]" => Kind::EmptyComment,
            "]" => Kind::Rsqua,
            _ => Kind::Unknown,
        };
        let place = Place {
            linen: lin,
            pos: pos,
            len: len,
        };
        if match_kind!(kind, Kind::Unknown) {
            Ok(Token::with_kind_value(kind, place, self.readbuf.clone()))
        } else {
            Ok(Token::with_kind(kind, place))
        }
    }

    /// Scans the remaining ASCII digits of a number literal whose first digit
    /// is already in `readbuf` and returns a `Kind::Nlit` token carrying the
    /// digits as text.
    fn get_number_literal(&mut self) -> Result<Token, String> {
        let lin = self.linen;
        // Remember the start now; reading further digits moves `self.pos`.
        let pos = self.pos;
        while let Some(c) = self.next_char() {
            if c.is_digit(10) {
                self.readbuf.push(c);
            } else {
                self.unread(c);
                break;
            }
        }
        let len = self.readbuf.chars().count() as Length;
        let place = Place {
            linen: lin,
            pos: pos,
            len: len,
        };
        Ok(Token::with_kind_value(Kind::Nlit, place, self.readbuf.clone()))
    }

    /// Scans a string literal opened by `begin_char` and returns a
    /// `Kind::Slit` token whose value is the text between the delimiters.
    ///
    /// The reported position is that of the opening delimiter, while the
    /// length covers only the text between the delimiters.
    ///
    /// # Errors
    ///
    /// Returns `"not valid string literal"` when the input ends before the
    /// closing delimiter is found.
    fn get_string_literal(&mut self, begin_char: char) -> Result<Token, String> {
        let lin = self.linen;
        let pos = self.pos;
        // Drop the opening delimiter that `next_token` put into the buffer.
        self.readbuf.clear();
        while let Some(c) = self.next_char() {
            if c == begin_char {
                let len = self.readbuf.chars().count() as Length;
                let place = Place::with(lin, pos, len);
                return Ok(Token::with_kind_value(Kind::Slit, place, self.readbuf.clone()));
            }
            if c == '\n' {
                // Keep the line counter right for tokens that follow a
                // literal spanning several lines.
                self.linen += 1;
            }
            self.readbuf.push(c);
        }
        Err("not valid string literal".to_string())
    }
}
/// Iterates over the tokens of the input.
///
/// Each item is the result of one `next_token` call. Iteration ends (`None`)
/// once no significant character is left, so a `for` loop over a tokenizer
/// terminates instead of receiving an endless run of "not found" errors.
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Result<Token, String>;

    fn next(&mut self) -> Option<Result<Token, String>> {
        match self.skip_whitespaces() {
            // Only blanks (or nothing at all) remained: the stream is done.
            None => None,
            Some(c) => {
                // Hand the character back so that `next_token` reads it as
                // the start of the token, and undo the single position
                // advance `skip_whitespaces` made for it; re-reading the
                // character advances the position again.
                self.next_char = Some(c);
                self.pos -= 1;
                Some(self.next_token())
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::Tokenizer;

    // Smoke check that prints every token of a sample input and stops at the
    // first error. It makes no assertions, and it is not registered with the
    // test harness because the attribute below is commented out.
    //#[test]
    fn new_tokenizer() {
        let tokenizer = Tokenizer::from_str(" test 10 \t10000; --[[]] fun if for 1+2 [false,true] --[[ test ]] `test test`");
        for outcome in tokenizer {
            if let Ok(token) = outcome {
                println!("token: {:?}", token);
            } else {
                break;
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment