Skip to content

Instantly share code, notes, and snippets.

@swgillespie
Created April 5, 2014 21:35
Show Gist options
  • Save swgillespie/9998357 to your computer and use it in GitHub Desktop.
Save swgillespie/9998357 to your computer and use it in GitHub Desktop.
///
/// A LashToken is a token in Lash. Every token has an associated value.
///
use std::io;
use std::str;
/// The kinds of token produced by the Lash lexer.
///
/// Data-carrying variants hold the value that was scanned; the remaining
/// variants are punctuation, reserved words and the end-of-input marker.
#[deriving(Show, Eq, Clone)]
pub enum LashToken {
    // Names and literals, each carrying its scanned value.
    Identifier(~str),
    Integer(i32),
    Float(f64),
    StringLiteral(~str),

    // Delimiters and punctuation.
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Dot,
    Comma,
    Semicolon,

    // Symbolic and word operators; the payload is the operator's spelling.
    Operator(&'static str),

    // Reserved words.
    Let,
    For,
    In,
    If,
    Else,
    Unless,
    Throw,
    Catch,
    As,
    Func,
    Nil,
    False,
    True,
    Using,
    From,
    Try,
    Rethrow,
    Finally,
    Return,

    // Emitted once when the input is exhausted.
    EOF
}
/// State of a lexer that reads characters from a buffered reader.
///
/// NOTE(review): the lifetime parameter `'a` is not referenced by any field.
pub struct LashLexer<'a> {
    // Buffered source of the characters being tokenized.
    input_stream: io::BufferedReader<~io::Reader>,
    // Characters pushed back by `unget_char`; the last one pushed is read first.
    unget_stack: Vec<char>,
    // Line counter, initialised to 1 by `new`.
    line_no: uint,
    // Column counter, initialised to 0 by `new` and reset at each newline.
    col_no: uint
}
impl<'a> LashLexer<'a> {
/// Creates a heap-allocated lexer that reads from `input`.
///
/// The reader is wrapped in a `BufferedReader`, the push-back stack starts
/// empty, and the position starts at line 1, column 0.
pub fn new(input: ~io::Reader) -> ~LashLexer {
    let buffered = io::BufferedReader::new(input);
    let pushed_back: Vec<char> = Vec::new();
    ~LashLexer {
        input_stream: buffered,
        unget_stack: pushed_back,
        line_no: 1,
        col_no: 0
    }
}
/// Lexes the rest of the input and returns every token in order.
///
/// The returned vector always ends with the `EOF` token.
pub fn tokens(&mut self) -> Vec<LashToken> {
    let mut collected: Vec<LashToken> = Vec::new();
    loop {
        let token = self.next_token();
        // Compare before pushing, because the push moves the token.
        let reached_end = token == EOF;
        collected.push(token);
        if reached_end {
            break;
        }
    }
    collected
}
fn get_char(&mut self) -> Option<char> {
self.col_no += 1;
match self.unget_stack.pop() {
Some(val) => Some(val),
None => match self.input_stream.read_char() {
Ok(val) => {
match val {
'\n' => {
self.col_no = 0;
self.line_no += 1;
Some(val)
},
'!' => self.lex_comment(),
_ => Some(val),
}
},
Err(error) => {
if error.kind == io::EndOfFile {
None
} else {
fail!("Lexer received unexpected error while reading input stream: {}", error);
}
},
},
}
}
/// Pushes `unget` back onto the input so that the next `get_char` call
/// returns it before reading anything new from the stream.
fn unget_char(&mut self, unget: char) {
    self.unget_stack.push(unget);
}
/// Returns the next character without consuming it, or `None` at end of
/// input.
///
/// Implemented as a `get_char` followed immediately by an `unget_char`.
fn peek_char(&mut self) -> Option<char> {
    let upcoming = self.get_char();
    match upcoming {
        Some(c) => self.unget_char(c),
        None => {},
    }
    upcoming
}
/// Classifies a scanned word as a reserved word, a word operator or a plain
/// identifier.
///
/// `string_stack` holds the characters of the word in order. Reserved words
/// map to their dedicated token, `and`/`or`/`xor`/`not` map to `Operator`,
/// and anything else becomes an `Identifier` carrying the word.
fn identifier_check(&mut self, string_stack: Vec<char>) -> LashToken {
    let word = str::from_chars(string_stack.as_slice());
    match word.as_slice() {
        // Operators that are spelled as words.
        "and"     => Operator("and"),
        "not"     => Operator("not"),
        "or"      => Operator("or"),
        "xor"     => Operator("xor"),
        // Reserved words.
        "as"      => As,
        "catch"   => Catch,
        "else"    => Else,
        "false"   => False,
        "finally" => Finally,
        "for"     => For,
        "from"    => From,
        "func"    => Func,
        "if"      => If,
        "in"      => In,
        "let"     => Let,
        "nil"     => Nil,
        "rethrow" => Rethrow,
        "return"  => Return,
        "throw"   => Throw,
        "true"    => True,
        "try"     => Try,
        "unless"  => Unless,
        "using"   => Using,
        // Everything else is a user-defined name.
        _ => Identifier(word.to_owned()),
    }
}
/// Skips leading whitespace and lexes the next token.
///
/// Returns `EOF` if the input ends before a non-whitespace character is
/// found. Otherwise dispatches on the first significant character: letters
/// and `_` start an identifier, digits start a number, `"` starts a string
/// literal, and anything else is handed to the delimiter/operator lexer.
fn next_token(&mut self) -> LashToken {
    let mut upcoming = self.peek_char();
    loop {
        match upcoming {
            None => return EOF,
            Some(c) if c.is_whitespace() => {
                // Discard the whitespace character, then look at what follows.
                let _ = self.get_char();
                upcoming = self.peek_char();
            },
            // Neither whitespace nor end of input: a token starts here.
            Some(_) => break,
        }
    }
    match upcoming.unwrap() {
        '\"' => self.lex_string_literal(),
        '0'..'9' => self.lex_number(),
        'a'..'z' | 'A'..'Z' | '_' => self.lex_identifier(),
        // A delimiter, an operator, or an unrecognized character.
        _ => self.lex_delimiter_or_operator(),
    }
}
/// Lexes an identifier, reserved word or word operator.
///
/// Consumes characters for as long as they are valid identifier characters.
/// The first character that is not is pushed back onto the input, and the
/// accumulated word is classified by `identifier_check`.
///
/// Whitespace is never a valid identifier character, so no separate
/// whitespace test is made. The earlier extra call to `self.is_whitespace`
/// bumped the line counter a second time for a newline that `get_char` had
/// already counted.
fn lex_identifier(&mut self) -> LashToken {
    let mut accumulator: Vec<char> = Vec::new();
    loop {
        match self.get_char() {
            Some(value) => {
                if !self.is_valid_identifier_body(value) {
                    // Not part of the word: hand it back for the next token.
                    self.unget_char(value);
                    return self.identifier_check(accumulator);
                }
                accumulator.push(value);
            },
            None => return self.identifier_check(accumulator),
        };
    }
}
/// Lexes a single-character delimiter, or defers to `match_operator` for
/// anything else.
///
/// Square brackets produce `LBracket`/`RBracket` and curly braces produce
/// `LBrace`/`RBrace`. These two pairs were previously swapped.
///
/// The caller (`next_token`) has already peeked a character, so the
/// `unwrap` here cannot see end of input.
fn lex_delimiter_or_operator(&mut self) -> LashToken {
    let mystery_char = self.get_char().unwrap();
    match mystery_char {
        '(' => LParen,
        ')' => RParen,
        '[' => LBracket,
        ']' => RBracket,
        '{' => LBrace,
        '}' => RBrace,
        '.' => Dot,
        ',' => Comma,
        ';' => Semicolon,
        _ => self.match_operator(mystery_char),
    }
}
/// Lexes an operator whose first character, `first_char`, has already been
/// consumed.
///
/// One character of lookahead is read. If the pair forms one of the
/// two-character operators (`**`, `==`, `<=`, `>=`) both characters are
/// consumed. Otherwise the lookahead is pushed back and `first_char` is
/// checked as a one-character operator. Previously any non-whitespace
/// lookahead was glued onto the operator, so input such as `1+2` failed on
/// the bogus operator `+2`.
///
/// # Failure
///
/// Fails (inside `operator_check`) if the result is not a known operator.
fn match_operator(&mut self, first_char: char) -> LashToken {
    match self.get_char() {
        Some(second) => {
            match (first_char, second) {
                // Keep this list in step with the two-character entries
                // accepted by `operator_check`.
                ('*', '*') | ('=', '=') | ('<', '=') | ('>', '=') => {
                    self.operator_check(str::from_chars([first_char, second]))
                },
                _ => {
                    // Not part of this operator: leave it for the next token.
                    self.unget_char(second);
                    self.operator_check(str::from_char(first_char))
                },
            }
        },
        None => self.operator_check(str::from_char(first_char)),
    }
}
/// Maps an operator spelling onto its `Operator` token.
///
/// # Failure
///
/// Fails with the current line and column if `operator` is not one of the
/// recognized spellings.
fn operator_check(&mut self, operator: ~str) -> LashToken {
    match operator.as_slice() {
        // Arithmetic.
        "+"  => Operator("+"),
        "-"  => Operator("-"),
        "*"  => Operator("*"),
        "/"  => Operator("/"),
        "%"  => Operator("%"),
        "**" => Operator("**"),
        // Single-character symbols.
        "&"  => Operator("&"),
        "|"  => Operator("|"),
        "^"  => Operator("^"),
        "~"  => Operator("~"),
        // Comparison.
        "==" => Operator("=="),
        "<"  => Operator("<"),
        "<=" => Operator("<="),
        ">"  => Operator(">"),
        ">=" => Operator(">="),
        _ => fail!("Unknown character encountered at line {}, col {}",
                   self.line_no, self.col_no),
    }
}
/// Lexes a double-quoted string literal and returns it as a `StringLiteral`.
///
/// The opening quote is consumed first. Characters are then collected until
/// the closing quote; a backslash hands control to `lex_escape_sequence`,
/// whose result is stored in place of the escape.
///
/// # Failure
///
/// Fails if the input ends before the closing quote.
fn lex_string_literal(&mut self) -> LashToken {
    // `next_token` only dispatches here after peeking a quotation mark,
    // so the opening quote can simply be discarded.
    let _ = self.get_char();
    let mut contents: Vec<char> = Vec::new();
    loop {
        match self.get_char() {
            None => fail!("Unexpected EOF while scanning string literal"),
            Some('\"') => break,
            Some('\\') => {
                let escaped = self.lex_escape_sequence();
                contents.push(escaped);
            },
            Some(other) => contents.push(other),
        }
    }
    StringLiteral(str::from_chars(contents.as_slice()))
}
/// Lexes the character after a backslash inside a string literal and returns
/// the character the escape stands for.
///
/// Supported escapes are `\n`, `\r`, `\t`, `\"` and `\\`. The backslash
/// escape is new: without it a string literal had no way to contain a
/// literal backslash. A digit is pushed back and handed to `lex_hex_escape`.
///
/// # Failure
///
/// Fails on an unrecognized escape or if the input ends after the backslash.
fn lex_escape_sequence(&mut self) -> char {
    match self.get_char() {
        Some(value) => {
            match value {
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                '\"' => '\"',
                '\\' => '\\',
                '0'..'9' => {
                    self.unget_char(value);
                    self.lex_hex_escape()
                },
                _ => fail!("Unexpected escape sequence"),
            }
        },
        None => fail!("Unexpected EOF while scanning string literal"),
    }
}
/// Placeholder for numeric escapes inside string literals.
///
/// # Failure
///
/// Always fails: numeric escapes are not implemented.
fn lex_hex_escape(&mut self) -> char {
    fail!("Hex escapes aren't supported yet, sorry.")
}
/// Lexes an integer or floating-point literal.
///
/// Consumes digits and at most one `.`. The first character that is neither
/// is pushed back, whitespace included. The literal becomes a `Float` if it
/// contained a `.`, otherwise an `Integer`.
///
/// Whitespace is no longer tested through `self.is_whitespace`, whose side
/// effect counted a newline that `get_char` had already counted.
///
/// # Failure
///
/// Fails on a second `.` in the literal. Also fails, reporting the literal
/// and the current position, if the text cannot be converted to the target
/// type (for example an integer that does not fit in `i32`).
fn lex_number(&mut self) -> LashToken {
    let mut accumulator: Vec<char> = Vec::new();
    let mut is_float = false;
    loop {
        match self.get_char() {
            Some(value) => {
                match value {
                    '0'..'9' => accumulator.push(value),
                    '.' => {
                        if is_float {
                            fail!("Unexpected extra . in float literal");
                        }
                        accumulator.push(value);
                        is_float = true;
                    },
                    // Anything else ends the literal; hand it back so the
                    // next token can start with it.
                    _ => {
                        self.unget_char(value);
                        break;
                    }
                }
            },
            None => break,
        }
    }
    let text = str::from_chars(accumulator.as_slice());
    if is_float {
        match from_str::<f64>(text.as_slice()) {
            Some(parsed) => Float(parsed),
            None => fail!("Invalid float literal '{}' at line {}, col {}",
                          text, self.line_no, self.col_no),
        }
    } else {
        match from_str::<i32>(text.as_slice()) {
            Some(parsed) => Integer(parsed),
            None => fail!("Invalid or out-of-range integer literal '{}' at line {}, col {}",
                          text, self.line_no, self.col_no),
        }
    }
}
/// Reports whether `value` may appear inside an identifier, i.e. whether it
/// matches `[a-zA-Z_0-9]`.
fn is_valid_identifier_body(&mut self, value: char) -> bool {
    match value {
        'a'..'z' => true,
        'A'..'Z' => true,
        '0'..'9' => true,
        '_' => true,
        _ => false
    }
}
/// Reports whether `value` is one of the whitespace characters
/// `[ \n\t\r]`.
///
/// This is a pure predicate. Line and column tracking belongs to `get_char`,
/// which counts every newline as it is read from the stream. The previous
/// version also bumped the line counter here, so a newline that ended an
/// identifier, number or operator was counted twice.
fn is_whitespace(&mut self, value: char) -> bool {
    match value {
        ' ' | '\t' | '\r' | '\n' => true,
        _ => false
    }
}
/// Skips the rest of a `!` comment and returns the newline that ends it, or
/// `None` if the input ends first.
///
/// Returning the terminating `'\n'` makes the comment act as whitespace.
/// The previous version returned the first character of the following line,
/// so `foo!note` followed by `bar` on the next line lexed as the single
/// identifier `foobar`.
///
/// A `!` inside the comment re-enters this function through `get_char`.
/// That inner call now also stops at the newline and returns it, so the
/// outer loop ends on the same line instead of swallowing the next one.
///
/// The newline's line/column bookkeeping has already been done by the
/// `get_char` call that read it.
fn lex_comment(&mut self) -> Option<char> {
    loop {
        match self.get_char() {
            Some('\n') => return Some('\n'),
            Some(_) => {},
            None => return None,
        }
    }
}
}
/// Lexes everything on standard input and prints the resulting token list.
fn main() {
    let source = ~io::stdio::stdin_raw();
    let mut lexer = LashLexer::new(source);
    println!("Tokens: {}", lexer.tokens());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment