Skip to content

Instantly share code, notes, and snippets.

@kaneel
Last active April 18, 2020 12:45
Show Gist options
  • Save kaneel/1617c869f9ab4bab2d20757ae23f247a to your computer and use it in GitHub Desktop.
Save kaneel/1617c869f9ab4bab2d20757ae23f247a to your computer and use it in GitHub Desktop.
/*
1 + 2 / 3 * 4
This example tests the precedence.
/*
(ROOT (OPERATOR +(LITERAL 1)(OPERATOR /(LITERAL 2)(OPERATOR *(LITERAL 3)(LITERAL 4)))))
/*
(1 + 2) / (3 * 4)
This should make sure / is the first node of the tree
/*
(ROOT (OPERATOR /(GROUP (OPERATOR +(LITERAL 1)(LITERAL 2)))(GROUP (OPERATOR *(LITERAL 3)(LITERAL 4)))))
/*
[1, 'hello', 2+3, (2+3)/2]
/*
(ROOT (LIST (LITERAL 1)(LITERAL 'hello')(OPERATOR +(LITERAL 2)(LITERAL 3))(OPERATOR /(GROUP (OPERATOR +(LITERAL 2)(LITERAL 3)))(LITERAL 2))))
/*
1 + call calc
*/
(ROOT (OPERATOR +(LITERAL 1)(CALL calc(ARGUMENTS ))))
/*
let a = b
*/
(ROOT (DECLARE (IDENTIFIER a))(IDENTIFIER b)))
/*
a = 'ho'
*/
(ROOT (ASSIGN (IDENTIFIER a)(LITERAL 'ho')))
/*
def meh arg1 arg2 arg3 {
y = 1;
x = (1 / 2) / y;
};
*/
(ROOT (DEFINE meh(ARGUMENTS (IDENTIFIER arg1)(IDENTIFIER arg2)(IDENTIFIER arg3))(SCOPE (ASSIGN (IDENTIFIER y)(LITERAL 1)))(ASSIGN (IDENTIFIER x)(OPERATOR /(GROUP (OPERATOR /(LITERAL 1)(LITERAL 2)))(IDENTIFIER y)))))
use regex::Regex;
use std::{collections::HashMap, fmt, str::Chars};
const REGEX_IDENTIFIER: &'static str = "[a-zA-Z]";
const REGEX_STRING: &'static str = "['\"]";
const REGEX_NUMBER: &'static str = "[0-9.]";
const REGEX_WORDS: &'static str = "[0-9a-z-A-Z.']";
const REGEX_OPERATOR: &'static str = "[-+*/]";
const REGEX_COMPARE: &'static str = "[<>]";
const SPACE: char = ' ';
const SEMICOLON: char = ';';
const EQUAL: char = '=';
const BANG: char = '!';
const LPAREN: char = '(';
const RPAREN: char = ')';
const LCURLY: char = '{';
const RCURLY: char = '}';
const LBRACKET: char = '[';
const RBRACKET: char = ']';
const COMMA: char = ',';
const NEWLINE: char = '\n';
const CR: char = '\r';
const TAB: char = '\t';
const SEPARATORS: [char; 20] = [
SPACE, SEMICOLON, LPAREN, RPAREN, LCURLY, RCURLY, LBRACKET, RBRACKET, COMMA, NEWLINE, CR, TAB,
EQUAL, BANG, '+', '-', '/', '*', '<', '>',
];
#[derive(Debug, PartialEq, Clone)]
pub enum ElementType {
IDENTIFIER,
STRING,
NUMBER,
INVALID,
EQUAL,
OPERATOR,
COMMA,
LPAREN,
RPAREN,
LCURLY,
RCURLY,
LBRACKET,
RBRACKET,
COMPARE_OPERATOR,
BANG,
SEMICOLON,
}
#[derive(Debug, PartialEq, Clone)]
pub struct CharPosition {
pub col: u32,
pub row: u32,
pub index: usize,
}
impl fmt::Display for CharPosition {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "col:{} row:{}", self.col, self.row)
}
}
#[derive(Debug, PartialEq, Clone)]
pub struct Lexeme<'a> {
pub kind: ElementType,
pub content: &'a str,
pub position: CharPosition,
}
impl<'a> fmt::Display for Lexeme<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Lex({:?}, {}, {})",
self.kind, self.content, self.position
)
}
}
#[derive(Debug, PartialEq)]
pub struct Cursor {
row: u32,
col: u32,
index: usize,
}
impl Cursor {
pub fn new() -> Cursor {
Cursor {
row: 1,
col: 1,
index: 0,
}
}
pub fn mv(&mut self) {
self.col = self.col + 1;
self.index = self.index + 1;
}
pub fn new_line(&mut self) {
self.col = 1;
self.row = self.row + 1;
self.index = self.index + 1;
}
pub fn get_char_position(&self) -> CharPosition {
CharPosition {
row: self.row,
col: self.col,
index: self.index,
}
}
}
pub struct Lexer<'a> {
string: &'a str,
iter: Chars<'a>,
}
impl<'a> Lexer<'a> {
pub fn from(string: &'a str) -> Lexer<'a> {
let iter = string.chars();
Lexer { iter, string }
}
fn scan_until(&self, start: char, separators: Vec<char>, stop: Option<char>) -> String {
let mut chars = self.iter.clone();
let mut accu = vec![start];
while let Some(chr) = chars.next() {
if separators.contains(&chr) {
if stop != None {
accu.push(chr);
}
break;
} else {
accu.push(chr);
}
}
accu.iter()
.map(|chr| chr.to_string())
.collect::<Vec<String>>()
.join("")
}
fn get_utf8_slice(&self, start: usize, end: usize) -> Option<&'a str> {
let string = self.string;
if string.chars().count() == end {
string
.char_indices()
.nth(start)
.map(|(start_pos, _)| &string[start_pos..])
} else {
string.char_indices().nth(start).and_then(|(start_pos, _)| {
string
.char_indices()
.nth(end)
.map(|(end_pos, _)| &string[start_pos..end_pos])
})
}
}
pub fn peek_next(&self) -> Option<char> {
let mut citer = self.iter.clone();
citer.next()
}
pub fn lex(&mut self, regexes: HashMap<&'a str, Regex>) -> Vec<Lexeme<'a>> {
let mut lexemes: Vec<Lexeme<'a>> = vec![];
let mut cursor = Cursor::new();
while let Some(chr) = self.iter.next() {
let character = chr.to_string();
let position = cursor.get_char_position();
let index = position.index;
let token_type: ElementType = match chr {
SPACE | CR | TAB => {
cursor.mv();
continue;
}
NEWLINE => {
cursor.new_line();
continue;
}
EQUAL => ElementType::EQUAL,
BANG => ElementType::BANG,
SEMICOLON => ElementType::SEMICOLON,
LCURLY => ElementType::LCURLY,
RCURLY => ElementType::RCURLY,
LPAREN => ElementType::LPAREN,
RPAREN => ElementType::RPAREN,
LBRACKET => ElementType::LBRACKET,
RBRACKET => ElementType::RBRACKET,
COMMA => ElementType::COMMA,
_ => {
if regexes
.get("identifier")
.unwrap()
.is_match(character.clone().as_str())
{
ElementType::IDENTIFIER
} else if regexes
.get("string")
.unwrap()
.is_match(character.clone().as_str())
{
ElementType::STRING
} else if regexes
.get("number")
.unwrap()
.is_match(character.clone().as_str())
{
ElementType::NUMBER
} else if regexes
.get("operator")
.unwrap()
.is_match(character.clone().as_str())
{
ElementType::OPERATOR
} else if regexes
.get("compare")
.unwrap()
.is_match(character.clone().as_str())
{
ElementType::COMPARE_OPERATOR
} else {
ElementType::INVALID
}
}
};
let lexeme: Lexeme = match token_type {
ElementType::IDENTIFIER => {
let scanned = self.scan_until(chr, SEPARATORS.to_vec(), None);
let range = (index, index + scanned.chars().count());
let content = self.get_utf8_slice(range.0, range.1).unwrap();
Lexeme {
kind: token_type,
content,
position,
}
}
ElementType::NUMBER => {
let scanned = self.scan_until(chr, SEPARATORS.to_vec(), None);
let range = (index, index + scanned.chars().count());
let content = self.get_utf8_slice(range.0, range.1).unwrap();
Lexeme {
kind: token_type,
content,
position,
}
}
ElementType::STRING => {
let scanned = self.scan_until(chr, vec![chr], Some(chr));
let range = (index, index + scanned.chars().count());
let content = self.get_utf8_slice(range.0, range.1).unwrap();
Lexeme {
kind: token_type,
content,
position,
}
}
ElementType::EQUAL => {
let next = self.peek_next();
if next == Some(EQUAL) {
let range = (index, index + 2);
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: ElementType::COMPARE_OPERATOR,
content,
position,
}
} else {
let range = (index, index + 1);
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: token_type,
content,
position,
}
}
}
ElementType::COMPARE_OPERATOR => {
let next = self.peek_next();
let range = if next == Some(EQUAL) {
(index, index + 2)
} else {
(index, index + 1)
};
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: token_type,
content,
position,
}
}
ElementType::BANG => {
let next = self.peek_next();
if next == Some(EQUAL) {
let range = (index, index + 2);
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: ElementType::COMPARE_OPERATOR,
content,
position,
}
} else {
let range = (index, index + 1);
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: token_type,
content,
position,
}
}
}
_ => {
let range = (index, index + 1);
let content = self.get_utf8_slice(range.0, range.1).unwrap_or("?");
Lexeme {
kind: token_type,
content,
position,
}
}
};
for _ in 0..lexeme.content.chars().count() - 1 {
self.iter.next();
cursor.mv();
}
cursor.mv();
lexemes.push(lexeme);
}
lexemes
}
}
pub fn lexer<'a>(string: &'a str) -> Vec<Lexeme<'a>> {
let regexes: HashMap<&'a str, Regex> = [
("identifier", Regex::new(REGEX_IDENTIFIER).unwrap()),
("string", Regex::new(REGEX_STRING).unwrap()),
("number", Regex::new(REGEX_NUMBER).unwrap()),
("operator", Regex::new(REGEX_OPERATOR).unwrap()),
("compare", Regex::new(REGEX_COMPARE).unwrap()),
]
.iter()
.cloned()
.collect();
let mut lexer = Lexer::from(string);
let lexeme = lexer.lex(regexes);
lexeme
}
use super::lexer::{CharPosition, ElementType, Lexeme};
use std::{collections::HashMap, fmt, vec::IntoIter};
#[derive(Debug, PartialEq, Clone)]
pub enum NodeType {
ARGUMENTS,
DECLARE,
ASSIGN,
CALL,
BANG,
COMPARE,
DEFINE,
GROUP,
IDENTIFIER,
CONSTANT,
LIST,
LITERAL,
OPERATOR,
ROOT,
SCOPE,
UNKNOWN,
}
#[derive(Debug, PartialEq, Clone)]
pub enum Reserved {
CALL,
DEF,
LET,
IF,
ELSE,
TRUE,
FALSE,
}
#[derive(Debug, PartialEq, Clone)]
pub struct Node<'a> {
pub kind: NodeType,
pub position: Option<CharPosition>,
pub content: Option<&'a str>,
pub children: Option<Vec<Node<'a>>>,
}
impl<'a> Node<'a> {
pub fn new(
kind: NodeType,
content: Option<&'a str>,
position: Option<CharPosition>,
) -> Node<'a> {
Node {
kind,
content,
position,
children: None,
}
}
pub fn append_child(&mut self, child: Node<'a>) {
let children = self.children.as_ref();
let empty_vec = vec![];
let mut v = children.unwrap_or(&empty_vec).to_vec();
v.push(child);
self.children = Some(v);
}
pub fn print(&self) -> String {
let content = String::from(" ") + self.content.unwrap_or("");
let children = self
.children
.as_ref()
.unwrap_or(&vec![])
.into_iter()
.map(|node| node.print())
.collect::<Vec<String>>()
.join("");
format!("({:?}{}{})", self.kind, content, children)
}
}
impl<'a> fmt::Display for Node<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.print())
}
}
pub struct Parser<'a> {
current_token: Option<Lexeme<'a>>,
reserved: HashMap<&'a str, Reserved>,
iter: IntoIter<Lexeme<'a>>,
}
impl<'a> Parser<'a> {
pub fn from(tokens: Vec<Lexeme<'a>>) -> Parser<'a> {
let reserved: HashMap<&'a str, Reserved> = [
("call", Reserved::CALL),
("def", Reserved::DEF),
("if", Reserved::IF),
("let", Reserved::LET),
("else", Reserved::ELSE),
("true", Reserved::TRUE),
("false", Reserved::FALSE),
]
.iter()
.cloned()
.collect();
Parser {
iter: tokens.into_iter(),
reserved,
current_token: None,
}
}
pub fn peek_next(&self) -> Option<Lexeme<'a>> {
let mut citer = self.iter.clone();
citer.next()
}
pub fn is_reserved(&self, lexeme: Option<Lexeme<'a>>) -> bool {
self.reserved.get(lexeme.unwrap().content) != None
}
pub fn accept(&self, accepted_kind: Vec<ElementType>) -> bool {
let next = self.peek_next();
let result = accepted_kind
.into_iter()
.filter(|accepted| match &next {
Some(Lexeme { kind, .. }) => kind == accepted,
None => false,
})
.collect::<Vec<ElementType>>();
result.len() > 0
}
pub fn expect(&self, expected_kind: ElementType) {
let current = self.current_token.as_ref();
let next = self.peek_next();
match next {
Some(Lexeme { position, kind, .. }) => {
if kind != expected_kind {
panic!(
"Expected {:?} but got {:?} at row:{}, col:{}",
expected_kind, kind, position.row, position.col
);
}
}
None => {
if let Some(current) = current {
let Lexeme { position, .. } = current;
panic!(
"expected {:?} but got nothing after row:{}, col:{}",
expected_kind, position.row, position.col
);
}
}
}
}
pub fn make_list(&mut self, list: Node<'a>, rejected_kind: ElementType) -> Node<'a> {
let mut mlist = list;
while match &self.peek_next() {
Some(lexeme) => lexeme.kind != rejected_kind,
None => false,
} {
mlist = self.expression(mlist, None);
}
mlist
}
pub fn make_define(&mut self, node: Node<'a>) -> Node<'a> {
let mut def = Node::new(NodeType::DEFINE, None, None);
let mut args = Node::new(NodeType::ARGUMENTS, None, None);
let mut mnode = node;
self.expect(ElementType::IDENTIFIER);
self.next();
let current = self.current_token.clone().unwrap();
def.content = Some(current.content);
def.position = Some(current.position);
if self.accept(vec![ElementType::IDENTIFIER]) {
// kind of expecting a list of ARGUMENTS
args = self.make_list(args, ElementType::LCURLY);
}
def.append_child(args);
// now we have a left curly brace to start a scope with
self.expect(ElementType::LCURLY);
def = self.expression(def, None);
mnode.append_child(def);
mnode
}
pub fn make_call(&mut self, node: Node<'a>) -> Node<'a> {
let mut call = Node::new(NodeType::CALL, None, None);
let mut args = Node::new(NodeType::ARGUMENTS, None, None);
let mut mnode = node;
self.expect(ElementType::IDENTIFIER);
self.next();
let current = self.current_token.clone().unwrap();
call.content = Some(current.content);
call.position = Some(current.position);
if self.accept(vec![ElementType::IDENTIFIER]) {
// kind of expecting a list of ARGUMENTS
args = self.make_list(args, ElementType::SEMICOLON);
}
call.append_child(args);
mnode.append_child(call);
mnode
}
pub fn make_condition(&mut self, node: Node<'a>) -> Node<'a> {
let mnode = node;
let mnode = self.expression(mnode, None);
self.expect(ElementType::LCURLY);
self.next();
mnode
}
pub fn make_declare(&mut self, node: Node<'a>) -> Node<'a> {
let mut mnode = node;
let next = self.peek_next();
let mut current = self.current_token.as_ref().unwrap();
let mut assign = Node::new(NodeType::DECLARE, None, Some(current.position.clone()));
self.expect(ElementType::IDENTIFIER);
if next != None && self.is_reserved(next) {
panic!(
"Trying to assign to a reserved word, check naming at {}",
current.position.clone()
);
}
self.next();
current = self.current_token.as_ref().unwrap();
let ident = Node::new(
NodeType::IDENTIFIER,
Some(current.content),
Some(current.position.clone()),
);
assign.append_child(ident);
self.expect(ElementType::EQUAL);
self.next();
mnode.append_child(assign);
self.expression(mnode, None)
}
pub fn expression(&mut self, parent_node: Node<'a>, previous: Option<Node<'a>>) -> Node<'a> {
let mut pnode = parent_node;
self.next();
pnode = match &self.current_token {
Some(Lexeme {
content,
position,
kind,
..
}) => match kind {
ElementType::NUMBER => {
let node = Node::new(NodeType::LITERAL, Some(content), Some(position.clone()));
// accept an operator
if self.accept(vec![ElementType::OPERATOR, ElementType::COMPARE_OPERATOR]) {
self.expression(pnode, Some(node))
} else {
pnode.append_child(node);
pnode
}
}
ElementType::STRING => {
let node = Node::new(NodeType::LITERAL, Some(content), Some(position.clone()));
pnode.append_child(node);
pnode
}
ElementType::IDENTIFIER => {
let mut node =
Node::new(NodeType::IDENTIFIER, Some(content), Some(position.clone()));
let reserved = self.reserved.get(content);
if previous != None {}
match reserved {
Some(result) => match result {
Reserved::CALL => {
pnode = self.make_call(pnode);
pnode
}
Reserved::DEF => {
pnode = self.make_define(pnode);
pnode
}
Reserved::IF => {
pnode = self.make_condition(pnode);
pnode
}
Reserved::LET => {
pnode = self.make_declare(pnode);
pnode
}
Reserved::TRUE | Reserved::FALSE => {
node = Node::new(
NodeType::CONSTANT,
Some(content),
Some(position.clone()),
);
pnode.append_child(node);
pnode
}
_ => pnode,
},
None => {
if self.accept(vec![
ElementType::EQUAL,
ElementType::COMPARE_OPERATOR,
ElementType::OPERATOR,
]) {
self.expression(pnode, Some(node))
} else {
pnode.append_child(node);
pnode
}
}
}
}
ElementType::LBRACKET => {
let mut list = Node::new(NodeType::LIST, None, Some(position.clone()));
list = self.make_list(list, ElementType::RBRACKET);
pnode.append_child(list);
self.expression(pnode, None)
}
ElementType::LCURLY => {
let scope = Node::new(NodeType::SCOPE, None, Some(position.clone()));
pnode.append_child(self.expression(scope, None));
self.expression(pnode, None)
}
ElementType::LPAREN => {
let mut group = Node::new(NodeType::GROUP, None, Some(position.clone()));
group = self.expression(group, previous);
if self.accept(vec![ElementType::COMPARE_OPERATOR, ElementType::OPERATOR]) {
self.expression(pnode, Some(group))
} else {
pnode.append_child(group);
self.expression(pnode, None)
}
}
ElementType::BANG => {
let n = Node::new(NodeType::BANG, None, Some(position.clone()));
self.expression(pnode, Some(n))
}
ElementType::COMPARE_OPERATOR => {
let mut n = Node::new(NodeType::COMPARE, Some(content), Some(position.clone()));
if previous != None {
n.append_child(previous.unwrap());
}
pnode.append_child(self.expression(n, None));
pnode
}
ElementType::EQUAL => {
let mut n = Node::new(NodeType::ASSIGN, None, Some(position.clone()));
if previous != None {
n.append_child(previous.unwrap());
}
pnode.append_child(self.expression(n, None));
self.expression(pnode, None)
}
ElementType::OPERATOR => {
let mut n =
Node::new(NodeType::OPERATOR, Some(content), Some(position.clone()));
if previous != None {
n.append_child(previous.unwrap());
}
pnode.append_child(self.expression(n, None));
self.expression(pnode, None)
}
ElementType::SEMICOLON => pnode,
ElementType::COMMA => pnode,
ElementType::RCURLY => pnode,
ElementType::RBRACKET => pnode,
ElementType::RPAREN => pnode,
_ => {
let n = Node::new(NodeType::UNKNOWN, Some(content), Some(position.clone()));
pnode.append_child(n);
pnode
}
},
None => pnode,
};
pnode
}
pub fn parse(&mut self) -> Node<'a> {
let mut node = Node::new(NodeType::ROOT, None, None);
loop {
node = self.expression(node, None);
if self.current_token == None {
break;
}
}
node
}
pub fn next(&mut self) {
let next = self.iter.next();
self.current_token = next;
}
}
pub fn parser<'a>(lexemes: Vec<Lexeme<'a>>) -> Node<'a> {
let mut parser = Parser::from(lexemes);
let tree = parser.parse();
tree
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment