Skip to content

Instantly share code, notes, and snippets.

@swgillespie
Created April 5, 2014 19:04
Show Gist options
  • Save swgillespie/9996431 to your computer and use it in GitHub Desktop.
Save swgillespie/9996431 to your computer and use it in GitHub Desktop.
use std::io;
use std::str;
///
/// A LashToken is a token in Lash. The `Identifier`, `Integer`, `Float`,
/// `StringLiteral` and `Operator` variants carry an associated value; every
/// other variant is a bare marker.
///
#[deriving(Show, Eq, Clone)]
pub enum LashToken {
// A name that matched none of the reserved words or word operators
// listed in `identifier_check`.
Identifier(~str),
// Numeric literals produced by `lex_number`: `Integer` when the literal
// contains no '.', `Float` when it contains one.
Integer(i32),
Float(f64),
// Text of a string literal.
StringLiteral(~str),
// Delimiters, produced by `lex_delimiter_or_operator`
// (parentheses, brackets, braces, '.' and ',').
LParen,
RParen,
LBracket,
RBracket,
LBrace,
RBrace,
Dot,
Comma,
// Operator text: either a word operator ("and", "or", "xor", "not") from
// `identifier_check` or a symbol operator built by `match_operator`.
Operator(~str),
// Reserved words; `identifier_check` maps the lowercase spelling of each
// name ("let", "for", ...) to the variant of the same name.
Let,
For,
In,
If,
Else,
Unless,
Throw,
Catch,
As,
Func,
Nil,
False,
True,
Using,
From,
Try,
Rethrow,
Finally,
// Returned by `next_token` once the input has no more characters.
EOF
}
/// Lexer state for Lash source text read from a `T`.
///
/// NOTE(review): the lifetime parameter `'a` is not used by any field
/// declared here; confirm whether it is still needed.
pub struct LashLexer<'a, T> {
// Buffered wrapper around the input; `get_char` reads characters from it.
input_stream: io::BufferedReader<T>,
// Characters pushed back by `unget_char`; `get_char` pops from here before
// it touches `input_stream`.
unget_stack : Vec<char>,
// Incremented by `is_whitespace` each time it is handed a '\n'.
line_no : uint,
// Incremented by every `get_char` call and reset to 0 by `is_whitespace`
// when it is handed a '\n'.
col_no : uint
}
impl<'a, T: Reader> LashLexer<'a, T> {
/// Creates a boxed lexer that reads Lash source from `input`.
///
/// The reader is taken by value because the lexer stores it inside an owned
/// `io::BufferedReader<T>`. `T` and its `Reader` bound come from the enclosing
/// `impl`; declaring a second `T` on the method would shadow it with an
/// unrelated type parameter.
///
/// The push-back stack starts empty and both position counters start at 0.
pub fn new(input: T) -> ~LashLexer<T> {
    ~LashLexer {
        input_stream: io::BufferedReader::new(input),
        unget_stack: Vec::new(),
        line_no: 0,
        col_no: 0
    }
}
/// Lexes the remaining input and returns every token, including the final `EOF`.
///
/// Tokens are collected in the order `next_token` yields them and the vector
/// is then reversed, so the result starts with `EOF` and ends with the first
/// token that was read.
pub fn tokens(&mut self) -> Vec<LashToken> {
    let mut collected: Vec<LashToken> = Vec::new();
    loop {
        let current = self.next_token();
        // Decide before the push, because the push moves `current` away.
        let reached_end = current == EOF;
        collected.push(current);
        if reached_end {
            break;
        }
    }
    collected.reverse();
    collected
}
fn get_char(&mut self) -> Option<char> {
self.col_no += 1;
match self.unget_stack.pop() {
Some(val) => Some(val),
None => match self.input_stream.read_char() {
Ok(val) => Some(val),
Err(error) => {
if error.kind == io::EndOfFile {
return None
} else {
fail!("Lexer received unexpected error while reading input stream: {}", error);
}
},
},
}
}
/// Pushes `unget` back so that the next `get_char` call returns it again.
///
/// `get_char` increments the column counter for every character it hands out,
/// so returning a character must undo that increment; otherwise a character
/// that is peeked and then read is counted twice.
fn unget_char(&mut self, unget: char) {
    // Guarded so the unsigned counter cannot wrap below zero.
    if self.col_no > 0 {
        self.col_no -= 1;
    }
    self.unget_stack.push(unget);
}
/// Returns the next character without consuming it, or `None` at end of input.
///
/// Implemented as a `get_char` followed by an `unget_char` of the same character.
fn peek_char(&mut self) -> Option<char> {
    let upcoming = self.get_char();
    match upcoming {
        Some(ch) => self.unget_char(ch),
        None => {}
    }
    upcoming
}
/// Turns the characters of a lexed word into a token.
///
/// `string_stack` holds the word's characters in source order. Reserved words
/// map to their keyword variants, the word operators `and`, `or`, `xor` and
/// `not` map to `Operator`, and anything else becomes an `Identifier`.
fn identifier_check(&mut self, string_stack: Vec<char>) -> LashToken {
    // The characters were appended with `Vec::push`, so the slice is already
    // in left-to-right order. Reversing it here would spell every word
    // backwards and no keyword would ever match.
    let string = str::from_chars(string_stack.as_slice());
    match string.as_slice() {
        // Reserved words
        "let" => Let,
        "for" => For,
        "in" => In,
        "if" => If,
        "else" => Else,
        "unless" => Unless,
        "throw" => Throw,
        "catch" => Catch,
        "as" => As,
        "func" => Func,
        "nil" => Nil,
        "false" => False,
        "true" => True,
        "using" => Using,
        "from" => From,
        "try" => Try,
        "rethrow" => Rethrow,
        "finally" => Finally,
        // word operators
        "and" => Operator(~"and"),
        "or" => Operator(~"or"),
        "xor" => Operator(~"xor"),
        "not" => Operator(~"not"),
        // if it's not any of these, it's an identifier
        _ => Identifier(string.to_owned()),
    }
}
/// Lexes and returns the next token, or `EOF` when the input is exhausted.
///
/// Leading whitespace is skipped first. The first significant character is
/// then pushed back and the matching `lex_*` routine, chosen from that
/// character, consumes the whole token.
fn next_token(&mut self) -> LashToken {
    loop {
        let next_char = match self.get_char() {
            Some(value) => value,
            None => return EOF,
        };
        // Whitespace only separates tokens. `is_whitespace` also does the
        // line/column bookkeeping for '\n', so each skipped character is
        // passed to it exactly once.
        if self.is_whitespace(next_char) {
            continue;
        }
        // Not whitespace: hand the character back so the sub-lexer sees it.
        self.unget_char(next_char);
        return match next_char {
            // [a-zA-Z_]
            'a'..'z' | 'A'..'Z' | '_' => self.lex_identifier(),
            // [0-9]
            '0'..'9' => self.lex_number(),
            // "
            '\"' => self.lex_string_literal(),
            // if it's not any of these, it's either a delimiter,
            // an operator, or it's an unrecognized character.
            _ => self.lex_delimiter_or_operator(),
        };
    }
}
/// Lexes an identifier, reserved word, or word operator.
///
/// Characters are accumulated while they match `[a-zA-Z_0-9]`. The word ends at
/// end of input, at whitespace (which is consumed), or at any other character
/// (which is pushed back so it can start the next token, e.g. the `(` in `foo(`).
/// The accumulated word is classified by `identifier_check`.
fn lex_identifier(&mut self) -> LashToken {
    let mut accumulator: Vec<char> = Vec::new();
    loop {
        match self.get_char() {
            Some(value) => {
                // Whitespace must be tested before the identifier-body test:
                // whitespace is itself not a valid identifier character, so
                // testing in the other order makes this branch unreachable.
                if self.is_whitespace(value) {
                    break;
                }
                if !self.is_valid_identifier_body(value) {
                    self.unget_char(value);
                    break;
                }
                accumulator.push(value);
            },
            None => break,
        }
    }
    self.identifier_check(accumulator)
}
/// Lexes a single-character delimiter, or hands off to `match_operator`.
///
/// Square brackets map to `LBracket`/`RBracket` and curly braces map to
/// `LBrace`/`RBrace`. Any character that is not a delimiter is treated as the
/// start of an operator.
fn lex_delimiter_or_operator(&mut self) -> LashToken {
    // we checked if this is None in next_token
    let mystery_char = self.get_char().unwrap();
    match mystery_char {
        '(' => LParen,
        ')' => RParen,
        '[' => LBracket,
        ']' => RBracket,
        '{' => LBrace,
        '}' => RBrace,
        '.' => Dot,
        ',' => Comma,
        _ => self.match_operator(mystery_char),
    }
}
/// Builds a one- or two-character `Operator` token starting with `first_char`.
///
/// The following character is joined onto the operator only when it cannot
/// belong to another kind of token. Whitespace after the first character is
/// consumed as a separator. An identifier/number character, a delimiter, or a
/// quote is pushed back so that it starts the next token. At end of input the
/// operator is just `first_char`.
fn match_operator(&mut self, first_char: char) -> LashToken {
    let second_char = match self.get_char() {
        Some(val) => {
            if self.is_whitespace(val) {
                // Separator: already consumed, and `is_whitespace` has done
                // the line/column bookkeeping.
                None
            } else {
                let starts_new_token = match val {
                    '(' | ')' | '[' | ']' | '{' | '}' | '.' | ',' | '\"' => true,
                    _ => self.is_valid_identifier_body(val),
                };
                if starts_new_token {
                    self.unget_char(val);
                    None
                } else {
                    Some(val)
                }
            }
        },
        None => None,
    };
    let operator_string = match second_char {
        Some(val) => str::from_chars([first_char, val]),
        None => str::from_char(first_char),
    };
    Operator(operator_string)
}
fn lex_string_literal(&mut self) -> LashToken {
Dot
}
/// Lexes an integer or floating-point literal.
///
/// Digits are accumulated, and a single `.` switches the literal to a float.
/// The literal ends at end of input, at whitespace (consumed), or at a
/// character that cannot be part of a word, such as a delimiter or operator
/// symbol (pushed back for the next token).
///
/// Fails on a second `.`, on a letter or underscore directly after the
/// digits, and (through `unwrap`) when the accumulated text does not parse as
/// the target numeric type.
fn lex_number(&mut self) -> LashToken {
    let mut accumulator: Vec<char> = Vec::new();
    let mut is_float = false;
    loop {
        match self.get_char() {
            Some(value) => {
                if self.is_whitespace(value) {
                    break;
                }
                match value {
                    '0'..'9' => accumulator.push(value),
                    '.' => {
                        if is_float {
                            fail!("Unexpected extra . in float literal");
                        }
                        accumulator.push(value);
                        is_float = true;
                    },
                    _ => {
                        // Digits were handled above, so an identifier-body
                        // character here is a letter or '_' glued to the
                        // number, which is malformed.
                        if self.is_valid_identifier_body(value) {
                            fail!("Unexpected character in number literal: {}", value);
                        }
                        // Anything else (e.g. ')' or '+') ends the literal
                        // and belongs to the next token.
                        self.unget_char(value);
                        break;
                    },
                }
            },
            None => break,
        }
    }
    let string = str::from_chars(accumulator.as_slice());
    if is_float {
        Float(from_str::<f64>(string).unwrap())
    } else {
        Integer(from_str::<i32>(string).unwrap())
    }
}
/// Reports whether `value` may appear inside an identifier, i.e. whether it
/// matches `[a-zA-Z_0-9]`.
fn is_valid_identifier_body(&mut self, value: char) -> bool {
    let is_lower = value >= 'a' && value <= 'z';
    let is_upper = value >= 'A' && value <= 'Z';
    let is_digit = value >= '0' && value <= '9';
    is_lower || is_upper || is_digit || value == '_'
}
/// Reports whether `value` is one of space, tab, carriage return or newline.
///
/// Has a side effect: when `value` is '\n' the line counter is incremented
/// and the column counter is reset to 0.
fn is_whitespace(&mut self, value: char) -> bool {
    if value == '\n' {
        self.line_no += 1;
        self.col_no = 0;
        return true;
    }
    value == ' ' || value == '\t' || value == '\r'
}
}
fn main() {
let lexer: LashLexer<io::stdio::StdReader> = LashLexer::new(io::stdio::stdin_raw());
let tokens = lexer.tokens();
println!("Tokens: {}", tokens);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment