Skip to content

Instantly share code, notes, and snippets.

@devjack
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save devjack/516b05626e6475abb486 to your computer and use it in GitHub Desktop.
Save devjack/516b05626e6475abb486 to your computer and use it in GitHub Desktop.
A crude lexer and parser for simple operations and expressions
<?php
/**
* This is a very crude file designed to trace the symbol table for simple operations and expressions. Syntax is based on Java
* - assignment operators
* - basic arithmetic
* - apply multiple operators in a single expression (x + y + z etc.)
*
* Not implemented:
* - complex structures (loops, conditions, arrays, classes, packages etc. etc. etc.)
* - does not support type checking, type hinting (ignored in lexer) or anything 'useful'
*
* INTENTION:
* - midnight coding hack to produce a basic parser for validating student code while teaching basic operations/expressions.
*/
define('LEXER_MATCH_STATEMENT', '/(int)?\s?([a-zA-Z_$][a-zA-Z_$0-9]*)\s?(=)\s?(.+);$/');
define('LEXER_MATCH_EXPRESSION', '/([-+\/*])/');
define('LEXER_STATEMENT', 'statement');
define('OP_ASSIGN', '=');
define('OP_ADDITION', '+');
define('OP_SUBTRACTION', '-');
define('OP_MULTIPLICATION', '*');
define('OP_DIVISION', '/');
function clean_lexer_statement($matches)
{
array_shift($matches); // shift the first element off - its just the input string. PHP is weird.
$return = [];
foreach ($matches as $match) {
$return[] = $match[0];
}
return $return;
}
function parse_line($code)
{
$matches = [];
if (preg_match_all(LEXER_MATCH_STATEMENT, $code, $matches)) {
$tokens = clean_lexer_statement($matches);
/*
* [TYPE, VARIABLE, OPERATOR, EXPRESSION]
*/
if (count($tokens) == 4) {
return [LEXER_STATEMENT => $tokens];
} elseif (count($tokens) == 3) {
// 3 tokens means we have only a (variable) (operator) (expr) sequence
return [LEXER_STATEMENT => [null] + $tokens];
}
throw new \Exception("Lexer identified statement but was unable to parse.");
}
throw new \Exception("Lexer was unable to pass line of code.");
}
function lexer_parse_expression($str)
{
$tokens = preg_split(LEXER_MATCH_EXPRESSION, $str, -1, PREG_SPLIT_DELIM_CAPTURE);
return array_map('trim', $tokens);
}
function parser_apply_operator($left, $op, $right)
{
switch ($op) {
case OP_ADDITION:
return $left + $right;
break;
case OP_SUBTRACTION:
return $left - $right;
break;
case OP_MULTIPLICATION:
return $left * $right;
break;
case OP_DIVISION:
return $left / $right;
break;
}
throw new \Exception("Unknown operator while applying operations");
}
function parser_value_from_token(SymbolTable $symbolTable, $token)
{
if ($symbolTable->exists($token)) {
return $symbolTable->lookup($token); // return the value if its a variable
}
return $token; // assume its a literal - no validation here (future bug no doubt!)
}
function parser_evaluate_expression(SymbolTable $symbolTable, $expressionTokens)
{
// shortcut case - only one value
if (count($expressionTokens) == 1) {
return parser_value_from_token($symbolTable, $expressionTokens[0]);
}
// next we look at the OPS for tokens 0, 1 and 2, removing them from the array as we go
$left = array_shift($expressionTokens);
$op = array_shift($expressionTokens);
$right = array_shift($expressionTokens);
// we take the left three tokens (removed from the array), apply the operation, and re-append it to the tokens as 1
// in short, reducing 3 tokens to 1, incrementally until we hit the shortcut case above.
$expressionTokens = [parser_apply_operator(
parser_value_from_token($symbolTable, $left),
$op,
parser_value_from_token($symbolTable, $right)
)] + $expressionTokens;
return parser_evaluate_expression($symbolTable, $expressionTokens);
}
function parser_do_statement(SymbolTable $symbolTable, $codeTokens)
{
$typehint = $codeTokens[0];
$symbol = $codeTokens[1];
$operation = $codeTokens[2];
$expression = $codeTokens[3];
// for now we only use typehint to determine if this is a 'declare' or a 'define' in java
if ($typehint) {
// must be creating a new variable in symboltable - we'll implement declare/define in future ( TODO )
}
$expression = lexer_parse_expression($expression);
// assume its all maths, i.e. we have to have odd tokens e.g. (VAR) (OPERATOR) (VAR);
if (count($expression) % 2 !== 1) {
throw new Exception("Unexpected expression tokens returned, expected an odd number of tokens");
}
$value = parser_evaluate_expression($symbolTable, $expression);
$symbolTable->set($symbol, $value);
}
class SymbolTable
{
protected $table = [];
public function __construct()
{
$this->table = [];
}
public function exists($name)
{
return array_key_exists($name, $this->table);
}
public function lookup($name)
{
if (array_key_exists($name, $this->table)) {
// the variable name exists in the symbol table.
return $this->table[$name];
}
throw new \Exception("Symbol does not exist. You probably didn't create that variable");
}
public function set($name, $value)
{
$this->table[$name] = $value;
}
public function getSymbols()
{
return array_keys($this->table);
}
public function printTable()
{
// longest symbol name - its only making the printing pretty!
$minWidth = max(array_map('strlen', array_keys($this->table)));
$width = (int)($minWidth * 1.3); // add roughly 30% extra chars for spacing.
foreach ($this->table as $symbol => $value) {
echo $symbol . str_repeat(" ", $width - strlen($symbol)) . " | " . $value . PHP_EOL;
}
}
}
function parser_parse($codeBlock)
{
$symbolTable = new SymbolTable();
$lines = preg_split('/$\R?^/m', $codeBlock); // split by lines - an assumption we make.
foreach ($lines as $line) {
$code = parse_line($line);
if (array_keys($code)[0] == LEXER_STATEMENT) {
// its a basic statement
parser_do_statement($symbolTable, $code[LEXER_STATEMENT]);
}
}
return $symbolTable;
}
if(count($argv) < 2) {
die("runscript.php expects one argument, none given.");
}
require_once('java_parser.php');
$codeSample = file_get_contents($argv[1]);
/**
* @var SymbolTable $symbolTable
*/
$symbolTable = parser_parse($codeSample);
$symbolTable->printTable();
int x = 5;
int frank = 6;
int y = x * frank;
@devjack
Copy link
Author

devjack commented May 6, 2015

Usage:
$ php java_parser.php sample.java

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment