Created
April 5, 2014 19:04
-
-
Save swgillespie/9996431 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///
/// A LashToken is a token in Lash. Tokens that carry data (identifiers,
/// literals and operators) hold their associated value.
///
use std::io; | |
use std::str; | |
#[deriving(Show, Eq, Clone)] | |
pub enum LashToken { | |
Identifier(~str), | |
Integer(i32), | |
Float(f64), | |
StringLiteral(~str), | |
LParen, | |
RParen, | |
LBracket, | |
RBracket, | |
LBrace, | |
RBrace, | |
Dot, | |
Comma, | |
Operator(~str), | |
Let, | |
For, | |
In, | |
If, | |
Else, | |
Unless, | |
Throw, | |
Catch, | |
As, | |
Func, | |
Nil, | |
False, | |
True, | |
Using, | |
From, | |
Try, | |
Rethrow, | |
Finally, | |
EOF | |
} | |
pub struct LashLexer<'a, T> { | |
input_stream: io::BufferedReader<T>, | |
unget_stack : Vec<char>, | |
line_no : uint, | |
col_no : uint | |
} | |
impl<'a, T: Reader> LashLexer<'a, T> { | |
pub fn new<T: io::Reader>(input: &'a T) -> ~LashLexer<T> { | |
~LashLexer{ | |
input_stream: io::BufferedReader::new(input), | |
unget_stack: Vec::new(), | |
line_no: 0, | |
col_no: 0 | |
} | |
} | |
pub fn tokens(&mut self) -> Vec<LashToken> { | |
let mut vector: Vec<LashToken> = Vec::new(); | |
let mut eof_found_yet = true; | |
while eof_found_yet { | |
let token = self.next_token(); | |
if token == EOF { | |
eof_found_yet = false; | |
} | |
vector.push(token); | |
} | |
vector.reverse(); | |
return vector; | |
} | |
fn get_char(&mut self) -> Option<char> { | |
self.col_no += 1; | |
match self.unget_stack.pop() { | |
Some(val) => Some(val), | |
None => match self.input_stream.read_char() { | |
Ok(val) => Some(val), | |
Err(error) => { | |
if error.kind == io::EndOfFile { | |
return None | |
} else { | |
fail!("Lexer received unexpected error while reading input stream: {}", error); | |
} | |
}, | |
}, | |
} | |
} | |
fn unget_char(&mut self, unget: char) { | |
self.unget_stack.push(unget); | |
} | |
fn peek_char(&mut self) -> Option<char> { | |
match self.get_char() { | |
Some(val) => { | |
self.unget_char(val); | |
return Some(val); | |
}, | |
None => None, | |
} | |
} | |
fn identifier_check(&mut self, mut string_stack: Vec<char>) -> LashToken { | |
// we've been putting stuff onto the top of this stack, so we'll | |
// need to reverse it to get a normal right-to-left order | |
string_stack.reverse(); | |
// this shouldn't fail unless something is horribly wrong, in which case | |
// we should abort anyway | |
let string = str::from_chars(string_stack.as_slice()); | |
match string.as_slice() { | |
// Reserved words | |
"let" => Let, | |
"for" => For, | |
"in" => In, | |
"if" => If, | |
"else" => Else, | |
"unless" => Unless, | |
"throw" => Throw, | |
"catch" => Catch, | |
"as" => As, | |
"func" => Func, | |
"nil" => Nil, | |
"false" => False, | |
"true" => True, | |
"using" => Using, | |
"from" => From, | |
"try" => Try, | |
"rethrow" => Rethrow, | |
"finally" => Finally, | |
// word operators | |
"and" => Operator(~"and"), | |
"or" => Operator(~"or"), | |
"xor" => Operator(~"xor"), | |
"not" => Operator(~"not"), | |
// if it's not any of these, it's an identifier | |
_ => Identifier(string.to_owned()), | |
} | |
} | |
fn next_token(&mut self) -> LashToken { | |
let next_char = self.peek_char(); | |
if next_char.is_none() { | |
return EOF; | |
} | |
match next_char.unwrap() { | |
// [a-zA-Z_] | |
'a'..'z' | 'A'..'Z' | '_' => self.lex_identifier(), | |
// [0-9] | |
'0'..'9' => self.lex_number(), | |
// " | |
'\"' => self.lex_string_literal(), | |
// if it's not any of these, it's either a delimiter, | |
// an operator, or it's an unrecognized character. | |
_ => self.lex_delimiter_or_operator(), | |
} | |
} | |
fn lex_identifier(&mut self) -> LashToken { | |
let mut accumulator: Vec<char> = Vec::new(); | |
loop { | |
match self.get_char() { | |
Some(value) => { | |
if !self.is_valid_identifier_body(value) { | |
fail!("Unrecognized character: {}", value); | |
} | |
if self.is_whitespace(value) { | |
return self.identifier_check(accumulator); | |
} | |
accumulator.push(value); | |
}, | |
None => return self.identifier_check(accumulator), | |
}; | |
} | |
} | |
fn lex_delimiter_or_operator(&mut self) -> LashToken { | |
// we checked if this is None in next_token | |
let mystery_char = self.get_char().unwrap(); | |
match mystery_char { | |
'(' => LParen, | |
')' => RParen, | |
'[' => LBrace, | |
']' => RBrace, | |
'{' => LBracket, | |
'}' => RBracket, | |
'.' => Dot, | |
',' => Comma, | |
_ => self.match_operator(mystery_char), | |
} | |
} | |
fn match_operator(&mut self, first_char: char) -> LashToken { | |
let operator_string = match self.get_char() { | |
Some(val) => str::from_chars([first_char, val]), | |
None => str::from_char(first_char), | |
}; | |
Operator(operator_string.to_owned()) | |
} | |
fn lex_string_literal(&mut self) -> LashToken { | |
Dot | |
} | |
fn lex_number(&mut self) -> LashToken { | |
let mut accumulator: Vec<char> = Vec::new(); | |
let mut is_float = false; | |
loop { | |
match self.get_char() { | |
Some(value) => { | |
if self.is_whitespace(value) { | |
break; | |
} | |
match value { | |
'0'..'9' => accumulator.push(value), | |
'.' => { | |
if is_float { | |
fail!("Unexpected extra . in float literal"); | |
} | |
accumulator.push(value); | |
is_float = true; | |
}, | |
_ => fail!("Unexpected character in number literal: {}, value"), | |
} | |
}, | |
None => break, | |
} | |
} | |
let string = str::from_chars(accumulator.as_slice()); | |
return if is_float { | |
Float(from_str::<f64>(string).unwrap()) | |
} else { | |
Integer(from_str::<i32>(string).unwrap()) | |
} | |
} | |
fn is_valid_identifier_body(&mut self, value: char) -> bool { | |
match value { | |
// [a-zA-Z_0-9] | |
'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => true, | |
_ => false | |
} | |
} | |
fn is_whitespace(&mut self, value: char) -> bool { | |
match value { | |
// [ \n\t\r] | |
' ' | '\t' | '\r' => true, | |
'\n' => { | |
self.line_no += 1; | |
self.col_no = 0; | |
true | |
} | |
_ => false | |
} | |
} | |
} | |
fn main() { | |
let lexer: LashLexer<io::stdio::StdReader> = LashLexer::new(io::stdio::stdin_raw()); | |
let tokens = lexer.tokens(); | |
println!("Tokens: {}", tokens); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment