Last active
January 14, 2018 10:39
-
-
Save ghaiklor/27bccba55cf5530e87584e6735214664 to your computer and use it in GitHub Desktop.
The simplest parser for mathematical expressions with explanation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* https://repl.it/@ghaiklor/The-simplest-math-parser | |
* | |
* What are we going to do? | |
* We are going to implement the simplest parser for mathematical expressions with parenthesis and precedence. | |
* | |
* Why? | |
* Just for get knowing more about software development. | |
* | |
* What will be the result of the parser? | |
* It will be able to parse and interpret expressions like "((2 + 1) * (4 - 2)) + 5". | |
* | |
* by @ghaiklor (https://twitter.com/ghaiklor) | |
*/ | |
/** | |
* Any compiler has a few stages to proceed by de-facto: | |
* | |
* 1) Lexical analysis | |
* 2) Syntax analysis | |
* 3) Semantic analysis | |
* 4) Intermediate Code Generation | |
* 5) Intermediate Code Optimization | |
* 6) Target Code Generation | |
* | |
* @see https://www.geeksforgeeks.org/compiler-design-phases-compiler/ | |
*/ | |
/** | |
* Of course, we are not going to implement all of them. | |
* All we need from the list is lexical analysis and syntax analysis (for our case). | |
* | |
* What is lexical analysis? | |
* Lexical Analysis is the first phase of compiler, also known as scanner. | |
* It converts the input program into a sequence of tokens. | |
* It means, we will get not the string representation of the program. | |
* But the object representation, so it will be more easily to manipulate them. | |
* Anyway, manipulating with objects is more easily than with strings, agreed? | |
* | |
* What is syntax analysis? | |
* Syntax Analysis or Parsing is the second phase, i.e. after lexical analysis. | |
* It checks the syntactical structure of the given input. | |
* i.e. whether the given input is in the correct syntax or not. | |
* It does so by building a data structure, called a Parse tree or Syntax tree. | |
* The parse tree is constructed by using the pre-defined Grammar of the language and the input string. | |
* | |
* @see https://www.geeksforgeeks.org/compiler-lexical-analysis/ | |
* @see https://www.geeksforgeeks.org/compiler-design-introduction-to-syntax-analysis/ | |
*/ | |
/** | |
* Let's begin with implementation already. | |
* As you already read, the first phase is lexical analysis. | |
* Aftwerwards, syntax analysis. | |
*/ | |
/** | |
* A lexical token or simply token is a pair consisting of a token name and an optional token value. | |
* The token name is a category of lexical unit. | |
* Common token names are: | |
* identifier -> x, color, UP | |
* keyword -> if, while, return | |
* separator -> }, (, ; | |
* operator -> +, <, = | |
* literal -> true, 6.02e23, "music" | |
* comment -> // must be negative | |
* | |
* In other words, token is the simplest part of our program. | |
* Speaking linguistically, token is a word from sentence. | |
* i.e. we have the sentence "Lazy Dog", tokens here are "Lazy" and "Dog". | |
* The same in our program -> tokens are words, identifiers, separators, etc... | |
* | |
* @see https://en.wikipedia.org/wiki/Lexical_analysis#Token | |
* @see https://www.geeksforgeeks.org/compiler-lexical-analysis/ | |
*/ | |
class Token { | |
/** | |
* When you are creating a Token, you need to specify its type and its value. | |
* For example, we see the number in our string -> its type is number and value is value from string. | |
* | |
* @param {String} type Enum from static getters of the Token | |
* @param {String} value String value from the input program | |
* @example | |
* new Token(Token.NUMBER, '20'); | |
*/ | |
constructor(type, value) { | |
this.type = type; | |
this.value = value; | |
} | |
/** | |
* Helper method for checking if the token type is as we expect. | |
* | |
* @param {String} type Enum from static getters of the Token | |
* @example | |
* const token = new Token(Token.NUMBER, '20'); | |
* token.is(Token.NUMBER); // true | |
* token.is(Token.OPERATOR); // false | |
*/ | |
is(type) { | |
return this.type === type; | |
} | |
/** | |
* When printing to console for demonstration purposes, it will be converted to this string. | |
* So, it's just for debug purposes and, of course, so you can see how the program is parsing. | |
*/ | |
toString() { | |
return `Token(${this.type},${this.value})`; | |
} | |
/** | |
* Just a static helper for creating tokens. | |
* So we can use it as Token.create(Token.NUMBER, '20'). | |
*/ | |
static create(type, value) { | |
return new this(type, value); | |
} | |
/** | |
* Token type constant for numbers (20, 10, etc) in the program. | |
*/ | |
static get NUMBER() { | |
return 'NUMBER'; | |
} | |
/** | |
* Token type constant for operators (+, -, /, *) in the program. | |
*/ | |
static get OPERATOR() { | |
return 'OPERATOR'; | |
} | |
/** | |
* Token type constant for the symbol "(" in the program. | |
*/ | |
static get LEFT_PARENTHESIS() { | |
return 'LEFT_PARENTHESIS'; | |
} | |
/** | |
* Token type constant for the symbol ")" in the program. | |
*/ | |
static get RIGHT_PARENTHESIS() { | |
return 'RIGHT_PARENTHESIS'; | |
} | |
/** | |
* Token type constant for the end-of-file, when our program is ended. | |
*/ | |
static get EOF() { | |
return 'EOF'; | |
} | |
} | |
/** | |
* That's it from tokens. | |
* As you see it is just a simple class with two properties. | |
* This class is responsible for storing additional infrormation about our chars in the program. | |
* Let us see how can we use it... | |
* NOTE: don't forgot that you need to run this snippet (the button is above) | |
*/ | |
console.log('===| Token Demostration |==='); | |
console.log(Token.create(Token.NUMBER, '20').toString()); | |
console.log(Token.create(Token.OPERATOR, '+').toString()); | |
console.log(Token.create(Token.LEFT_PARENTHESIS, '(').toString()); | |
console.log(Token.create(Token.RIGHT_PARENTHESIS, ')').toString()); | |
console.log(Token.create(Token.EOF, 'EOF').toString()); | |
console.log(''); | |
/** | |
* Scanners... | |
* Here, we are getting the program as a string. | |
* We need to somehow tokenize it (using implemented Tokens above). | |
* So, as a result, we must get not the string, but meaningful parts of the program -> Token instances. | |
* | |
* @see https://www.geeksforgeeks.org/compiler-lexical-analysis/ | |
*/ | |
class Scanner { | |
/** | |
* Our scanner accepts the string of the program. | |
* In our case -> mathematical expression. | |
* | |
* It stores the input into internal property. | |
* Sets the default position of the cursor to 0. | |
* Updates currentChar internal property with the first char from the program. | |
* | |
* Why? | |
* When analyzing the string, we need to move our cursor forward. | |
* We literally must read the string step by step, getting the following characters and analyze them. | |
* Until we see the end of file, of course. | |
* That is why we need to store position where we are now and what is the symbol at this position. | |
*/ | |
constructor(input) { | |
this.input = input; | |
this.position = 0; | |
this.currentChar = this.input[this.position]; | |
} | |
/** | |
* How do we move our cursor forward? | |
* By incrementing the position of it and updating the current character, we see at this position. | |
* This process is called "advance the cursor" or "advancing". | |
* Each time, we think that we're done with the current character, we are calling advance(). | |
*/ | |
advance() { | |
this.position += 1; | |
this.currentChar = this.input[this.position]; | |
} | |
/** | |
* Now, how do we analyze the string via advancing the cursor and seeing the character? | |
* By looking at the character and deciding what to do next, of course :) | |
* For these purposes we are going to implement getNextToken. | |
* Each time this method is called, it will lookup for the next token and return Token instance of it. | |
* | |
* How do this work in few words... | |
* While we have the characters to read, we read them. | |
* By looking up at their values, we are deciding what the token it is. | |
* We see the plus symbol -> create operator token with value "+" -> return it and so on... | |
*/ | |
getNextToken() { | |
// while we have characters to proceed | |
// we return appropriate token type for this character | |
// and don't forgot about advancing our cursor | |
// we're not going to stuck here forever | |
while (this.currentChar) { | |
if (/\s/.test(this.currentChar)) { | |
// We don't want to parse whitespaces, don't we? | |
while (this.currentChar && /\s/.test(this.currentChar)) { | |
this.advance(); | |
} | |
} else if (this.currentChar === '(') { | |
this.advance(); | |
return Token.create(Token.LEFT_PARENTHESIS, '('); | |
} else if (this.currentChar === ')') { | |
this.advance(); | |
return Token.create(Token.RIGHT_PARENTHESIS, ')'); | |
} else if (this.currentChar === '+') { | |
this.advance(); | |
return Token.create(Token.OPERATOR, '+'); | |
} else if (this.currentChar === '-') { | |
this.advance(); | |
return Token.create(Token.OPERATOR, '-'); | |
} else if (this.currentChar === '*') { | |
this.advance(); | |
return Token.create(Token.OPERATOR, '*'); | |
} else if (this.currentChar === '/') { | |
this.advance(); | |
return Token.create(Token.OPERATOR, '/'); | |
} else if (/\d/.test(this.currentChar)) { | |
// We got a number here, yay! | |
// But how do we stack the whole number? | |
// We see only the one character at this moment | |
// By stacking them up until we don't see the digit | |
let number = ''; | |
while (this.currentChar && /\d/.test(this.currentChar)) { | |
number += this.currentChar; | |
this.advance(); | |
} | |
// If we have the dot right after the number | |
// It means that we have a float here | |
if (this.currentChar === '.') { | |
number += '.'; | |
this.advance(); | |
while (this.currentChar && /\d/.test(this.currentChar)) { | |
number += this.currentChar; | |
this.advance(); | |
} | |
} | |
return Token.create(Token.NUMBER, number); | |
} | |
} | |
// When this.currentChar is not available, it means end-of-file | |
return Token.create(Token.EOF, 'EOF'); | |
} | |
} | |
/** | |
* That's all from lexical analysis. | |
* We can see the result or our work right away. | |
* Let's say, we want to get the tokens for this expression: | |
* ((1 + 3) * (10 - 7)) + 8 | |
*/ | |
const scanner = new Scanner('((1 + 3) * (10 - 7)) + 8.25'); | |
console.log('===| Scanner Demostration |==='); | |
console.log('===| ((1 + 3) * (10 - 7)) + 8 |==='); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(scanner.getNextToken().toString()); | |
console.log(); | |
/** | |
* A syntax analyzer or parser takes the input from a lexical analyzer in the form of token streams. | |
* The parser analyzes the source code (token stream) against the production rules. | |
* The output of this phase is a parse tree. | |
* This way, the parser accomplishes two tasks, i.e., parsing the code, and generating a parse tree. | |
* | |
* Production rules is a grammar, actually. | |
* It describes the order of tokens in your program. | |
* I.e. we know, that we can have NUMBER OPERATOR NUMBER, but can't NUMBER NUMBER OPERATOR. | |
* Hope you got the point, but still, highly recommend to you read the link below. | |
* | |
* @see https://www.tutorialspoint.com/compiler_design/compiler_design_syntax_analysis.htm | |
* | |
* We have tokens, we have the first token of our program right here, we know what this token means... | |
* But, what to do next? | |
* | |
* We need to have grammar of our language, in our case -> grammar for mathematical expressions. | |
* It's like regular expressions that matches your program, roughly said. | |
* And, for our language, we are going to use this grammar: | |
* | |
* <expression> ::= <term> + <expression> | <term> - <expression> | <term> | |
* <term> ::= <factor> * <term> | <factor> / <term> | <factor> | |
* <factor> ::= (<expression>) | <number> | |
* | |
* Factor could be the number itself or expression in parenthesis. | |
* Term could be the factor, or factor / term, or factor * term | |
* Expressions could be term, or term - expression, or term + expression. | |
* | |
* Did you see, that some of rules has recursion into itself? | |
* i.e. expression ::= term + expression. | |
* Do not be scared, this recursion has an exit. | |
* You see, expression could be just a term sometimes, so term + expression, at some point... | |
* will be term + term, which is factor + factor at some point, which is number + number at some point. | |
* So, this grammar actually can recursively step into, find out the result of expression and return it. | |
* I know, it's hard to understand at first look, take your time and read the article above once more... | |
* | |
* Let's implement the parser for this grammar! | |
*/ | |
class Parser { | |
/** | |
* Our parser accepts input of the program as a string. | |
* All we need to do right away is to create our scanner for lexical analysis. | |
* Aftwerwards, get the first token from the input. | |
*/ | |
constructor(input) { | |
this.scanner = new Scanner(input); | |
this.currentToken = this.scanner.getNextToken(); | |
} | |
/** | |
* Our next step is the process which is called consuming the tokens. | |
* Some resources uses the name "eating the token". | |
* | |
* This process checks the expected token type against actual one. | |
* If they are similar, we can consume the current token and take the next one. | |
* Otherwise, we got the token that we didn't expect, meaning the error in our program. | |
* Yes, our parser can throw an error if your mathematical expression is not valid. | |
*/ | |
consume(type) { | |
if (this.currentToken.is(type)) { | |
this.currentToken = this.scanner.getNextToken(); | |
} else { | |
throw new Error(`Expected ${type}, but got ${this.currentToken}`); | |
} | |
} | |
/** | |
* We can get stream of tokens... | |
* We can consume them and throw an error if something wrong... | |
* The last step - implement parser for our grammar above. | |
* The process is simple, you just map names of production rules to names of your methods. | |
* Also, body of your function must leads the production rule step-by-step. | |
* Let me copy paste grammar again, so we can map it visually to our methods. | |
* | |
* <expression> ::= <term> + <expression> | <term> - <expression> | <term> | |
* <term> ::= <factor> * <term> | <factor> / <term> | <factor> | |
* <factor> ::= (<expression>) | <number> | |
*/ | |
expression() { | |
const term = this.term(); | |
const token = this.currentToken; | |
if (token.is(Token.OPERATOR) && token.value === '+') { | |
this.consume(Token.OPERATOR); | |
return term + this.expression(); | |
} else if (token.is(Token.OPERATOR) && token.value === '-') { | |
this.consume(Token.OPERATOR); | |
return term - this.expression(); | |
} else { | |
return term; | |
} | |
} | |
term() { | |
const factor = this.factor(); | |
const token = this.currentToken; | |
if (token.is(Token.OPERATOR) && token.value === '*') { | |
this.consume(Token.OPERATOR); | |
return factor * this.term(); | |
} else if (token.is(Token.OPERATOR) && token.value === '/') { | |
this.consume(Token.OPERATOR); | |
return factor / this.term(); | |
} else { | |
return factor; | |
} | |
} | |
factor() { | |
const token = this.currentToken; | |
if (token.is(Token.LEFT_PARENTHESIS)) { | |
this.consume(Token.LEFT_PARENTHESIS); | |
const expr = this.expression(); | |
this.consume(Token.RIGHT_PARENTHESIS); | |
return expr; | |
} else if (token.is(Token.NUMBER)) { | |
this.consume(Token.NUMBER); | |
return parseFloat(token.value); | |
} | |
} | |
/** | |
* We are done with implementing parser for our grammar here. | |
* Now, let us just add method parse() which will parse the expression and gives us the result. | |
*/ | |
parse() { | |
return this.expression(); | |
} | |
} | |
/** | |
* We are ready to test our parser, hooray! | |
*/ | |
console.log('===| Parser Demonstration |=='); | |
console.log('1'); | |
console.log(new Parser('1').parse()); | |
console.log(''); | |
console.log('1 + 5'); | |
console.log(new Parser('1 + 5').parse()); | |
console.log(''); | |
console.log('6 - 2'); | |
console.log(new Parser('6 - 2').parse()); | |
console.log(''); | |
console.log('2 * 4'); | |
console.log(new Parser('2 * 4').parse()); | |
console.log(''); | |
console.log('8 / 2'); | |
console.log(new Parser('8 / 2').parse()); | |
console.log(''); | |
console.log('3 * (5 + 2)'); | |
console.log(new Parser('3 * (5 + 2)').parse()); | |
console.log(''); | |
console.log('((1 + 3) * (10 - 7)) + 8.25'); | |
console.log(new Parser('((1 + 3) * (10 - 7)) + 8.25').parse()); | |
console.log(''); | |
console.log('3.14 * 2.28'); | |
console.log(new Parser('3.14 * 2.28').parse()); | |
'Thanks for Reading'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment