Last active
February 15, 2023 14:54
-
-
Save DmitrySoshnikov/29f7a9425cdab69ea68f to your computer and use it in GitHub Desktop.
LL-parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* = LL parser = | |
* | |
* by Dmitry Soshnikov <dmitry.soshnikov@gmail.com> | |
* MIT Style license | |
* | |
* Often one can see manually written LL parsers implemented as | |
* recursive descent. Approach in this diff is a classical parse table | |
* state machine. | |
* | |
* LL parser consists of: | |
* | |
* 1. input buffer (source code) | |
* 2. stack | |
* 3. parsing table (state machine) | |
* | |
* Parsing table: | |
* | |
* Table is used to get the next production number to apply, based on current | |
* symbol from the buffer, and the symbol (terminal or non-terminal) | |
* on top of the stack. | |
* | |
* - Rows in the table are non-terminals | |
* - Columns are terminals | |
* | |
* Parsing algorithm: | |
* | |
* - if the top of the stack is *terminal*, and matches current symbol in | |
* buffer, then just discard it from the stack, and move cursor further. | |
* (if doesn't match -- parse error). | |
* | |
* - Else (it must be a non-terminal), replace it with an | |
* alternative production, corresponding to the production number. | |
* (if no production -- parse error). | |
* | |
* $ - is a special symbol used to mark bottom of the stack | |
* and end of the buffer. | |
* | |
* S - is a start symbol. | |
* | |
* At the beginning stack is: | |
* | |
* [S, $] | |
* | |
* Example: | |
* | |
* Grammar: | |
* | |
* 1. S -> F | |
* 2. S -> (S + F) | |
* 3. F -> a | |
* | |
* Input: | |
* | |
* (a + a) | |
* | |
* Parse table: | |
* | |
* +------------------+ | |
* | ( ) a + $ | | |
* +------------------+ | |
* | S 2 - 1 - - | | |
* | F - - 3 - - | | |
* +------------------+ | |
* | |
* The production rules which are applied to parse `(a + a)` are: 2, 1, 3, 3: | |
* | |
* S -> ( S + F ) -> ( F + F ) -> ( a + F ) -> ( a + a ) | |
* | |
* We see that each time the *left most* non-terminal is replaced. Hence, the | |
* name of the parser: LL - scan source from Left to right, and apply the | |
* Left most derivation. | |
*/ | |
/** | |
* Our grammar representation. Key is a production number from | |
* the grammar, the value is: 0 - LHS, 1 - RHS of the production. | |
*/ | |
var grammar = { | |
1: ['S', 'F'], // 1. S -> F | |
2: ['S', '(S + F)'], // 2. S -> (S + F) | |
3: ['F', 'a'], // 3. F -> a | |
}; | |
/** | |
* Initial stack: bottom is the "end of the stack" ($), | |
* and the start symbol ('S' in our case) is there too. | |
*/ | |
var stack = ['S', '$']; | |
function parse(source) { | |
return parseFromTable(source, buildTable(grammar, source)); | |
} | |
function printGrammar(grammar) { | |
console.log('Grammar:\n'); | |
for (var k in grammar) { | |
console.log(' ' + k + '.', grammar[k][0], '->', grammar[k][1]); | |
} | |
console.log(''); | |
} | |
/** | |
* Builds a state-machine table where table[non-terminal][terminal] | |
* coordinates determine which next production rule to apply. | |
*/ | |
function buildTable(grammar, source) { | |
// For now we assume a correct table was already built from | |
// the grammar and source for us. We'll cover how to build it | |
// automatically in the next lessons (see "first" and "follow" | |
// sets topic). We encode only valid rules here and skip all other | |
// (they return `undefined` meaning a parse error). | |
// | |
// +------------------+ | |
// | ( ) a + $ | | |
// +------------------+ | |
// | S 2 - 1 - - | | |
// | F - - 3 - - | | |
// +------------------+ | |
// | |
return { | |
'S': {'(': 2, 'a': 1}, | |
'F': {'a': 3} | |
}; | |
} | |
var productionNumbers = []; | |
/** | |
* Parses a source using parse table. | |
* Doesn't build a parse tree yet, but just checks a source | |
* string for acceptance (prints production rules appled in case | |
* of successful parse, or throws on parse errors). | |
*/ | |
function parseFromTable(source, table) { | |
printGrammar(grammar); | |
console.log('Source:', source); | |
source = source.replace(/\s+/g, ''); | |
for (var cursor = 0; cursor < source.length;) { | |
var current = source[cursor]; | |
var top = stack.shift(); | |
// Terminal is on the stack, just advance. | |
if (isTerminal(top, table) && top === current) { | |
// We already shifted the symbol from the stack, | |
// so just advance the cursor. | |
cursor++; | |
continue; | |
} | |
// Else, it's a non-terminal, do derivation (replace it | |
// in the stack with corresponding production). | |
stack.unshift.apply(stack, getProduction(table, top, current)); | |
} | |
console.log('Accepted. Productions:', productionNumbers.join(', '), '\n'); | |
} | |
function isTerminal(symbol, table) { | |
return !table.hasOwnProperty(symbol); | |
} | |
function getProduction(table, top, current) { | |
var nextProductionNumber = table[top][current]; | |
if (!nextProductionNumber) { | |
throw Error('Parse error, unexpected token: ' + current); | |
} | |
var nextProduction = grammar[nextProductionNumber]; | |
productionNumbers.push(nextProductionNumber); | |
// Return an array of symbols from a production, e.g. | |
// '(', 'S', '+', 'F', ')' for '(S + F)', since | |
// each symbol should be pushed onto the stack. | |
return nextProduction[1].split(/\s*/); | |
} | |
// Test: | |
parse('(a + a)'); | |
// Output: | |
// Grammar: | |
// | |
// 1. S -> F | |
// 2. S -> (S + F) | |
// 3. F -> a | |
// | |
// Source: (a + a) | |
// Accepted. Productions: 2, 1, 3, 3 |
Excellent Thanks!
Awesome code... Thanks bro!! :)
@DmitrySoshnikov Excellent Work.Please make a video and code about LL(k) Parser where k > 1 I mean LL(2) Parser.Thank You.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome. Thanks