Created
April 5, 2014 21:35
-
-
Save swgillespie/9998357 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///
/// A LashToken is a token in Lash. Identifier, literal and operator tokens
/// carry an associated value; every other token is a plain marker.
///
use std::io; | |
use std::str; | |
#[deriving(Show, Eq, Clone)] | |
pub enum LashToken { | |
Identifier(~str), | |
Integer(i32), | |
Float(f64), | |
StringLiteral(~str), | |
LParen, | |
RParen, | |
LBracket, | |
RBracket, | |
LBrace, | |
RBrace, | |
Dot, | |
Comma, | |
Semicolon, | |
Operator(&'static str), | |
Let, | |
For, | |
In, | |
If, | |
Else, | |
Unless, | |
Throw, | |
Catch, | |
As, | |
Func, | |
Nil, | |
False, | |
True, | |
Using, | |
From, | |
Try, | |
Rethrow, | |
Finally, | |
Return, | |
EOF | |
} | |
pub struct LashLexer<'a> { | |
input_stream: io::BufferedReader<~io::Reader>, | |
unget_stack : Vec<char>, | |
line_no : uint, | |
col_no : uint | |
} | |
impl<'a> LashLexer<'a> { | |
/// Creates a boxed lexer that reads from `input`.
///
/// The reader is wrapped in a `BufferedReader`, the push-back stack starts
/// empty, and the position starts at line 1, column 0.
pub fn new(input: ~io::Reader) -> ~LashLexer {
    let buffered = io::BufferedReader::new(input);
    let pushed_back: Vec<char> = Vec::new();
    ~LashLexer {
        input_stream: buffered,
        unget_stack: pushed_back,
        line_no: 1,
        col_no: 0,
    }
}
pub fn tokens(&mut self) -> Vec<LashToken> { | |
let mut vector: Vec<LashToken> = Vec::new(); | |
let mut eof_found_yet = true; | |
while eof_found_yet { | |
let token = self.next_token(); | |
if token == EOF { | |
eof_found_yet = false; | |
} | |
vector.push(token); | |
} | |
return vector; | |
} | |
fn get_char(&mut self) -> Option<char> { | |
self.col_no += 1; | |
match self.unget_stack.pop() { | |
Some(val) => Some(val), | |
None => match self.input_stream.read_char() { | |
Ok(val) => { | |
match val { | |
'\n' => { | |
self.col_no = 0; | |
self.line_no += 1; | |
Some(val) | |
}, | |
'!' => self.lex_comment(), | |
_ => Some(val), | |
} | |
}, | |
Err(error) => { | |
if error.kind == io::EndOfFile { | |
None | |
} else { | |
fail!("Lexer received unexpected error while reading input stream: {}", error); | |
} | |
}, | |
}, | |
} | |
} | |
fn unget_char(&mut self, unget: char) { | |
self.unget_stack.push(unget); | |
} | |
fn peek_char(&mut self) -> Option<char> { | |
match self.get_char() { | |
Some(val) => { | |
self.unget_char(val); | |
return Some(val); | |
}, | |
None => None, | |
} | |
} | |
fn identifier_check(&mut self, string_stack: Vec<char>) -> LashToken { | |
// this shouldn't fail unless something is horribly wrong, in which case | |
// we should abort anyway | |
let string = str::from_chars(string_stack.as_slice()); | |
match string.as_slice() { | |
// Reserved words | |
"let" => Let, | |
"for" => For, | |
"in" => In, | |
"if" => If, | |
"else" => Else, | |
"unless" => Unless, | |
"throw" => Throw, | |
"catch" => Catch, | |
"as" => As, | |
"func" => Func, | |
"nil" => Nil, | |
"false" => False, | |
"true" => True, | |
"using" => Using, | |
"from" => From, | |
"try" => Try, | |
"rethrow" => Rethrow, | |
"finally" => Finally, | |
"return" => Return, | |
// word operators | |
"and" => Operator("and"), | |
"or" => Operator("or"), | |
"xor" => Operator("xor"), | |
"not" => Operator("not"), | |
// if it's not any of these, it's an identifier | |
_ => Identifier(string.to_owned()), | |
} | |
} | |
fn next_token(&mut self) -> LashToken { | |
let mut consuming_whitespace = true; | |
let mut next_char = self.peek_char(); | |
while consuming_whitespace { | |
if next_char.is_none() { | |
return EOF; | |
} else if next_char.unwrap().is_whitespace() { | |
// consume the whitespace and discard | |
let _ = self.get_char(); | |
// peek at the next character | |
next_char = self.peek_char(); | |
} else { | |
// we've got something that neither whitespace or EOF | |
consuming_whitespace = false; | |
} | |
} | |
match next_char.unwrap() { | |
// [a-zA-Z_] | |
'a'..'z' | 'A'..'Z' | '_' => self.lex_identifier(), | |
// [0-9] | |
'0'..'9' => self.lex_number(), | |
// " | |
'\"' => self.lex_string_literal(), | |
// if it's not any of these, it's either a delimiter, | |
// an operator, or it's an unrecognized character. | |
_ => self.lex_delimiter_or_operator(), | |
} | |
} | |
fn lex_identifier(&mut self) -> LashToken { | |
let mut accumulator: Vec<char> = Vec::new(); | |
loop { | |
match self.get_char() { | |
Some(value) => { | |
if self.is_whitespace(value) || !self.is_valid_identifier_body(value) { | |
// push whatever we found back onto the stream | |
self.unget_char(value); | |
return self.identifier_check(accumulator); | |
} | |
accumulator.push(value); | |
}, | |
None => return self.identifier_check(accumulator), | |
}; | |
} | |
} | |
fn lex_delimiter_or_operator(&mut self) -> LashToken { | |
// we checked if this is None in next_token | |
let mystery_char = self.get_char().unwrap(); | |
match mystery_char { | |
'(' => LParen, | |
')' => RParen, | |
'[' => LBrace, | |
']' => RBrace, | |
'{' => LBracket, | |
'}' => RBracket, | |
'.' => Dot, | |
',' => Comma, | |
';' => Semicolon, | |
_ => self.match_operator(mystery_char), | |
} | |
} | |
fn match_operator(&mut self, first_char: char) -> LashToken { | |
match self.get_char() { | |
Some(val) => { | |
if !self.is_whitespace(val) { | |
self.operator_check(str::from_chars([first_char, val])) | |
} else { | |
self.operator_check(str::from_char(first_char)) | |
} | |
}, | |
None => self.operator_check(str::from_char(first_char)), | |
} | |
} | |
fn operator_check(&mut self, operator: ~str) -> LashToken { | |
match operator.as_slice() { | |
"+" => Operator("+"), | |
"-" => Operator("-"), | |
"*" => Operator("*"), | |
"/" => Operator("/"), | |
"**" => Operator("**"), | |
"%" => Operator("%"), | |
"&" => Operator("&"), | |
"|" => Operator("|"), | |
"~" => Operator("~"), | |
"==" => Operator("=="), | |
"<=" => Operator("<="), | |
">=" => Operator(">="), | |
"<" => Operator("<"), | |
">" => Operator(">"), | |
"^" => Operator("^"), | |
_ => fail!("Unknown character encountered at line {}, col {}", | |
self.line_no, self.col_no), | |
} | |
} | |
fn lex_string_literal(&mut self) -> LashToken { | |
let mut accumulator : Vec<char> = Vec::new(); | |
// we already know this is a quotation mark | |
let _ = self.get_char(); | |
loop { | |
match self.get_char() { | |
Some(value) => { | |
match value { | |
'\\' => accumulator.push(self.lex_escape_sequence()), | |
'\"' => break, | |
_ => accumulator.push(value), | |
}; | |
}, | |
None => fail!("Unexpected EOF while scanning string literal"), | |
} | |
} | |
StringLiteral(str::from_chars(accumulator.as_slice())) | |
} | |
fn lex_escape_sequence(&mut self) -> char { | |
match self.get_char() { | |
Some(value) => { | |
match value { | |
'n' => '\n', | |
'r' => '\r', | |
't' => '\t', | |
'\"' => '\"', | |
'0'..'9' => { | |
self.unget_char(value); | |
self.lex_hex_escape() | |
}, | |
_ => fail!("Unexpected escape sequence"), | |
} | |
}, | |
None => fail!("Unexpected EOF while scanning string literal"), | |
} | |
} | |
fn lex_hex_escape(&mut self) -> char { | |
fail!("Hex escapes aren't supported yet, sorry."); | |
} | |
fn lex_number(&mut self) -> LashToken { | |
let mut accumulator: Vec<char> = Vec::new(); | |
let mut is_float = false; | |
loop { | |
match self.get_char() { | |
Some(value) => { | |
if self.is_whitespace(value) { | |
break; | |
} | |
match value { | |
'0'..'9' => accumulator.push(value), | |
'.' => { | |
if is_float { | |
fail!("Unexpected extra . in float literal"); | |
} | |
accumulator.push(value); | |
is_float = true; | |
}, | |
// if we don't know what it is then it's probably | |
// not a number, so put it back onto the stream | |
_ => { | |
self.unget_char(value); | |
break; | |
} | |
} | |
}, | |
None => break, | |
} | |
} | |
let string = str::from_chars(accumulator.as_slice()); | |
return if is_float { | |
Float(from_str::<f64>(string).unwrap()) | |
} else { | |
Integer(from_str::<i32>(string).unwrap()) | |
} | |
} | |
fn is_valid_identifier_body(&mut self, value: char) -> bool { | |
match value { | |
// [a-zA-Z_0-9] | |
'a'..'z' | 'A'..'Z' | '_' | '0'..'9' => true, | |
_ => false | |
} | |
} | |
fn is_whitespace(&mut self, value: char) -> bool { | |
match value { | |
// [ \n\t\r] | |
' ' | '\t' | '\r' => true, | |
'\n' => { | |
self.line_no += 1; | |
self.col_no = 0; | |
true | |
} | |
_ => false | |
} | |
} | |
fn lex_comment(&mut self) -> Option<char> { | |
loop { | |
match self.get_char() { | |
Some(val) => { | |
if val == '\n' { | |
break; | |
} | |
}, | |
None => return None, | |
}; | |
} | |
return self.get_char(); | |
} | |
} | |
fn main() { | |
let mut lexer = LashLexer::new(~io::stdio::stdin_raw()); | |
let tokens = lexer.tokens(); | |
println!("Tokens: {}", tokens); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment