Last active
August 29, 2015 14:20
-
-
Save devjack/516b05626e6475abb486 to your computer and use it in GitHub Desktop.
A crude lexer and parser for simple operations and expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* This is a very crude file designed to trace the symbol table for simple operations and expressions. Syntax is based on Java | |
* - assignment operators | |
* - basic arithmetic | |
* - apply multiple operators in a single expression (x + y + z etc.) | |
* | |
* Not implemented: | |
* - complex structures (loops, conditions, arrays, classes, packages etc. etc. etc.) | |
* - does not support type checking, type hinting (ignored in lexer) or anything 'useful' | |
* | |
* INTENTION: | |
* - midnight coding hack to produce a basic parser for validating student code while teaching basic operations/expressions. | |
*/ | |
define('LEXER_MATCH_STATEMENT', '/(int)?\s?([a-zA-Z_$][a-zA-Z_$0-9]*)\s?(=)\s?(.+);$/'); | |
define('LEXER_MATCH_EXPRESSION', '/([-+\/*])/'); | |
define('LEXER_STATEMENT', 'statement'); | |
define('OP_ASSIGN', '='); | |
define('OP_ADDITION', '+'); | |
define('OP_SUBTRACTION', '-'); | |
define('OP_MULTIPLICATION', '*'); | |
define('OP_DIVISION', '/'); | |
function clean_lexer_statement($matches) | |
{ | |
array_shift($matches); // shift the first element off - its just the input string. PHP is weird. | |
$return = []; | |
foreach ($matches as $match) { | |
$return[] = $match[0]; | |
} | |
return $return; | |
} | |
function parse_line($code) | |
{ | |
$matches = []; | |
if (preg_match_all(LEXER_MATCH_STATEMENT, $code, $matches)) { | |
$tokens = clean_lexer_statement($matches); | |
/* | |
* [TYPE, VARIABLE, OPERATOR, EXPRESSION] | |
*/ | |
if (count($tokens) == 4) { | |
return [LEXER_STATEMENT => $tokens]; | |
} elseif (count($tokens) == 3) { | |
// 3 tokens means we have only a (variable) (operator) (expr) sequence | |
return [LEXER_STATEMENT => [null] + $tokens]; | |
} | |
throw new \Exception("Lexer identified statement but was unable to parse."); | |
} | |
throw new \Exception("Lexer was unable to pass line of code."); | |
} | |
function lexer_parse_expression($str) | |
{ | |
$tokens = preg_split(LEXER_MATCH_EXPRESSION, $str, -1, PREG_SPLIT_DELIM_CAPTURE); | |
return array_map('trim', $tokens); | |
} | |
function parser_apply_operator($left, $op, $right) | |
{ | |
switch ($op) { | |
case OP_ADDITION: | |
return $left + $right; | |
break; | |
case OP_SUBTRACTION: | |
return $left - $right; | |
break; | |
case OP_MULTIPLICATION: | |
return $left * $right; | |
break; | |
case OP_DIVISION: | |
return $left / $right; | |
break; | |
} | |
throw new \Exception("Unknown operator while applying operations"); | |
} | |
function parser_value_from_token(SymbolTable $symbolTable, $token) | |
{ | |
if ($symbolTable->exists($token)) { | |
return $symbolTable->lookup($token); // return the value if its a variable | |
} | |
return $token; // assume its a literal - no validation here (future bug no doubt!) | |
} | |
function parser_evaluate_expression(SymbolTable $symbolTable, $expressionTokens) | |
{ | |
// shortcut case - only one value | |
if (count($expressionTokens) == 1) { | |
return parser_value_from_token($symbolTable, $expressionTokens[0]); | |
} | |
// next we look at the OPS for tokens 0, 1 and 2, removing them from the array as we go | |
$left = array_shift($expressionTokens); | |
$op = array_shift($expressionTokens); | |
$right = array_shift($expressionTokens); | |
// we take the left three tokens (removed from the array), apply the operation, and re-append it to the tokens as 1 | |
// in short, reducing 3 tokens to 1, incrementally until we hit the shortcut case above. | |
$expressionTokens = [parser_apply_operator( | |
parser_value_from_token($symbolTable, $left), | |
$op, | |
parser_value_from_token($symbolTable, $right) | |
)] + $expressionTokens; | |
return parser_evaluate_expression($symbolTable, $expressionTokens); | |
} | |
function parser_do_statement(SymbolTable $symbolTable, $codeTokens) | |
{ | |
$typehint = $codeTokens[0]; | |
$symbol = $codeTokens[1]; | |
$operation = $codeTokens[2]; | |
$expression = $codeTokens[3]; | |
// for now we only use typehint to determine if this is a 'declare' or a 'define' in java | |
if ($typehint) { | |
// must be creating a new variable in symboltable - we'll implement declare/define in future ( TODO ) | |
} | |
$expression = lexer_parse_expression($expression); | |
// assume its all maths, i.e. we have to have odd tokens e.g. (VAR) (OPERATOR) (VAR); | |
if (count($expression) % 2 !== 1) { | |
throw new Exception("Unexpected expression tokens returned, expected an odd number of tokens"); | |
} | |
$value = parser_evaluate_expression($symbolTable, $expression); | |
$symbolTable->set($symbol, $value); | |
} | |
class SymbolTable | |
{ | |
protected $table = []; | |
public function __construct() | |
{ | |
$this->table = []; | |
} | |
public function exists($name) | |
{ | |
return array_key_exists($name, $this->table); | |
} | |
public function lookup($name) | |
{ | |
if (array_key_exists($name, $this->table)) { | |
// the variable name exists in the symbol table. | |
return $this->table[$name]; | |
} | |
throw new \Exception("Symbol does not exist. You probably didn't create that variable"); | |
} | |
public function set($name, $value) | |
{ | |
$this->table[$name] = $value; | |
} | |
public function getSymbols() | |
{ | |
return array_keys($this->table); | |
} | |
public function printTable() | |
{ | |
// longest symbol name - its only making the printing pretty! | |
$minWidth = max(array_map('strlen', array_keys($this->table))); | |
$width = (int)($minWidth * 1.3); // add roughly 30% extra chars for spacing. | |
foreach ($this->table as $symbol => $value) { | |
echo $symbol . str_repeat(" ", $width - strlen($symbol)) . " | " . $value . PHP_EOL; | |
} | |
} | |
} | |
function parser_parse($codeBlock) | |
{ | |
$symbolTable = new SymbolTable(); | |
$lines = preg_split('/$\R?^/m', $codeBlock); // split by lines - an assumption we make. | |
foreach ($lines as $line) { | |
$code = parse_line($line); | |
if (array_keys($code)[0] == LEXER_STATEMENT) { | |
// its a basic statement | |
parser_do_statement($symbolTable, $code[LEXER_STATEMENT]); | |
} | |
} | |
return $symbolTable; | |
} | |
if(count($argv) < 2) { | |
die("runscript.php expects one argument, none given."); | |
} | |
require_once('java_parser.php'); | |
$codeSample = file_get_contents($argv[1]); | |
/** | |
* @var SymbolTable $symbolTable | |
*/ | |
$symbolTable = parser_parse($codeSample); | |
$symbolTable->printTable(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int x = 5; | |
int frank = 6; | |
int y = x * frank; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
$ php java_parser.php sample.java