Skip to content

Instantly share code, notes, and snippets.

@Aatch
Created October 19, 2015 04:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Aatch/8210c43dddde8775ff73 to your computer and use it in GitHub Desktop.
Save Aatch/8210c43dddde8775ff73 to your computer and use it in GitHub Desktop.
extern crate smallvec;
use std::collections::VecDeque;
use std::io::Read;
use smallvec::SmallVec;
/// Result type returned by the lexer's fallible operations; failures are
/// reported through the `Error` type that is in scope in this module.
pub type LexResult<T> = std::result::Result<T, Error>;
/**
* Provides functionality for implementing a lexer.
*
* Wraps a byte source and offers byte- and character-level reading with
* lookahead, while keeping track of the current line and column.
*/
#[derive(Clone)]
pub struct Lexer<R: Read> {
/// The underlying byte source.
input: R,
/// Lookahead buffer: filled by `peek_bytes` and drained by `bump_bytes`.
lookahead: SmallVec<[u8; 8]>,
/// Set once a read from `input` has returned zero bytes.
is_eof: bool,
/// Current line; starts at 1 and is incremented for every consumed `\n` byte.
line: u32,
/// Current column; starts at 0, is reset to 0 by a consumed `\n` byte and
/// incremented for every other consumed byte.
column: u32,
}
impl<R: Read> Lexer<R> {
pub fn new(input: R) -> Lexer<R> {
Lexer {
input: input,
lookahead: SmallVec::new(),
is_eof: false,
line: 1,
column: 0
}
}
pub fn into_input(self) -> R {
self.input
}
/// Read a single byte from the input
pub fn read_byte(&mut self) -> LexResult<u8> {
if self.lookahead.len() > 0 {
let b = self.lookahead[0];
self.bump_bytes(1);
return Ok(b);
} else if self.is_eof {
return Err(Error::Eof);
} else {
let mut buf = [0u8];
match self.input.read(&mut buf) {
Ok(0) => {
self.is_eof = true;
return Err(Error::Eof);
}
Err(e) => return Err(Error::Io(e)),
_ => ()
}
if buf[0] == b'\n' {
self.line += 1;
self.column = 0;
} else {
self.column += 1;
}
return Ok(buf[0]);
}
}
/// Look at the next `n` bytes from the input. If there are not enough bytes, the returned
/// slice may be smaller than `n`.
pub fn peek_bytes(&mut self, n: usize) -> LexResult<&[u8]> {
if n <= self.lookahead.len() {
return Ok(&self.lookahead[0..n]);
} else {
let mut cur_len = self.lookahead.len();
let to_read = n - cur_len;
// Fill the extra space we need with 0
for _ in 0..to_read {
self.lookahead.push(0);
}
// Read the appropriate number of bytes from the input
while cur_len < self.lookahead.len() {
// Make the buffer
let buf = &mut self.lookahead[cur_len..n];
match self.input.read(buf) {
// Read zero bytes, thats an eof, but don't treat it as an error, just stop reading
Ok(0) => {
self.is_eof = true;
break;
}
// Read some bytes, bump the current length of the lookahead buffer
Ok(n) => {
cur_len += n;
}
Err(e) => return Err(Error::Io(e))
}
};
// Return a slice into the lookahead buffer
Ok(&self.lookahead[..cur_len])
}
}
/// Try to read a character from the input.
/// Returns None if a valid character cannot be read and input is not consumed.
pub fn read_char(&mut self) -> LexResult<Option<char>> {
let c = try!(self.peek_char());
if let Some(c) = c {
self.bump_bytes(c.len_utf8());
}
Ok(c)
}
/// Try to look a character from the input.
/// Returns None if a valid character cannot be read.
pub fn peek_char(&mut self) -> LexResult<Option<char>> {
let c = {
let bytes = try!(self.peek_bytes(6));
let s = std::str::from_utf8(bytes);
match s {
Ok(s) if s.len() > 0 => {
s.chars().nth(0).unwrap()
}
_ => {
return Ok(None);
}
}
};
Ok(Some(c))
}
/// Matches the input against the given bytes, if they compare equal, the input is consumed and
/// true is returned. Otherwise, false is returned.
pub fn eat_bytes(&mut self, bytes: &[u8]) -> LexResult<bool> {
let eq = {
let look = try!(self.peek_bytes(bytes.len()));
look == bytes
};
if eq {
self.bump_bytes(bytes.len());
Ok(true)
} else {
Ok(false)
}
}
/// If a character read from the input matches the given character, consume the input and
/// return true. Otherwise, false is returned.
pub fn eat_char(&mut self, c: char) -> LexResult<bool> {
let look = try!(self.peek_char());
if look == Some(c) {
self.bump_bytes(c.len_utf8());
Ok(true)
} else {
Ok(false)
}
}
#[inline(always)]
pub fn eat_str(&mut self, s: &str) -> LexResult<bool> {
self.eat_bytes(s.as_bytes())
}
/// Skip input until the first non-whitespace character. Whitespace is defined as a character
/// with White_Space unicode property.
pub fn skip_whitespace(&mut self) -> LexResult<()> {
loop {
if let Some(c) = try!(self.peek_char()) {
if c.is_whitespace() {
self.bump_bytes(c.len_utf8());
} else {
break;
}
}
}
Ok(())
}
/// Discard up to `n` bytes from the lookahead buffer
pub fn bump_bytes(&mut self, n: usize) {
let keep;
// swap the bytes we need to keep to the front of the buffer
if n < self.lookahead.len() {
keep = self.lookahead.len() - n;
for i in 0..keep {
self.lookahead.swap(i, i+n);
}
} else {
keep = 0;
}
for &b in &self.lookahead[keep..] {
if b == b'\n' {
self.line += 1;
self.column = 0;
} else {
self.column += 1;
}
}
for _ in 0..n {
self.lookahead.pop();
}
}
/// Returns the position as (line, column)
pub fn position(&self) -> (u32, u32) {
(self.line, self.column)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment