Skip to content

Instantly share code, notes, and snippets.

@drslump
Created August 13, 2009 22:11
Show Gist options
  • Save drslump/167494 to your computer and use it in GitHub Desktop.
Save drslump/167494 to your computer and use it in GitHub Desktop.
A runtime lexer generator which obtains the lexical rules from annotations
<?php
/**
* A runtime lexer generator which obtains the lexical rules from annotations.
*
* Copyright 2009 (c) Iván -DrSlump- Montes <drslump@pollinimini.net>
* Released to the Public Domain
*
* It supports the following annotations:
*
* - @lexer-mode
*
* Sets the matching mode for the lexing engine. It can be either "first"
* or "longest". The former will consume the first rule that matches while
* the later will try to match all the rules and use the one that matches
* the longer amount of text.
* This annotation can only be placed at the class level.
*
* - @lexer-pattern:
*
* Defines a pattern to use in rules. The first value is an identifier,
* the second is a PHP compatible regular expression without delimiters.
* This annotation can be placed at class and method levels.
*
* - @lexer-rule:
*
* Associates the given rule with the class method just below its comment
* block. It's composed of an optional lexer state name ("START" if not
* defined) between < and > characters and one or more pattern identifiers or
* double quotes enclosed strings.
* This annotation can only be placed at the method level.
*
* Example:
*
* /**
* * @lexer-mode first
* * @lexer-pattern whitespace [ \t\r\n]+
* * /
* class MyLexer extends Lexer {
* /** @lexer-rule whitespace +/
* function _space() {
* return self::TOKEN_IGNORE;
* }
*
* /**
* * @lexer-pattern ident ([A-Za-z_]+)
* * @lexer-rule "$" ident
* +/
* function _ident($m) {
* echo "Ident found: " . $m[1] . "\n";
* $this->_token = MyParser::TKN_IDENT;
* }
*
* /** @lexer-rule "'" +/
* function _stringStart() {
* $this->pushState('STRING');
* return self::TOKEN_IGNORE;
* }
*
* /**
* * @lexer-pattern text [^']*
* * @lexer-rule <STRING> text
* +/
* function _stringText() {
* $this->_token = MyParser::TKN_STRING;
* }
*
* /** @lexer-rule <STRING> "'" +/
* function _stringEnd() {
* $this->popState();
* }
* }
*
*/
class Lexer implements Iterator {
// Scanner modes
const MODE_FIRST = 'first';
const MODE_LONGEST = 'longest';
/**
* Consume the token and advance
*/
const TOKEN_ACCEPT = null;
/**
* Repeat at the same position (in another state)
*/
const TOKEN_REPEAT = true;
/**
* Ignore the token and advance
*/
const TOKEN_IGNORE = false;
/**
* Repeat at the same position skiping the rule
*/
const TOKEN_SKIP = 'skip';
// Cache the lexer configuration for each class used
private static $__lexers = array();
// Internal data
private $__mode = self::MODE_FIRST;
private $__classname;
private $__object;
private $__stack = array();
private $__state;
// Lexer current position in the data
protected $__line = 0;
protected $__pos = 0;
protected $__col = 0;
protected $_data;
protected $_token;
protected $_value;
/**
* Constructor
*
* @param string $data The text to scan
* @param stirng | object $class Either a class name or an object
*/
function __construct($data = '', $class = null)
{
$this->__state = 'START';
$this->_data = $data;
if (NULL === $class) {
$this->__classname = get_class($this);
$this->__object = $this;
} else if (is_string($class)) {
$this->__classname = $class;
$this->__object = $class;
} else {
$this->__classname = get_class($class);
$this->__object = $class;
}
}
public function getLine()
{
return $this->__line;
}
public function getOffset()
{
return $this->__pos;
}
public function getColumn()
{
return $this->__col;
}
/**
* Changes the lexer state
*
* @param string $state The new lexer state
* @return void
*/
public function setState( $state )
{
$lexer = $this->__lexer();
if (!isset($lexer['states'][$state])) {
$states = array_keys($lexer['states']);
throw new Exception('State ' . $state . ' not found. Available states: "' .
implode('", "', $states) . '"');
}
$this->__state = $state;
}
/**
* Returns the current state
*
* @return string
*/
public function getState()
{
return $this->__state;
}
/**
* Pushes the current state into a stack and sets a new one
*
* @param string $state
* @return void
*/
public function pushState( $state )
{
$this->__stack[] = $this->__state;
$this->setState($state);
}
/**
* Pops the top state from the stack and makes it the current one
*
* @return void
*/
public function popState()
{
if (!count($this->__stack)) {
throw new Exception('States stack is empty, unable to pop from state "' .
$this->__state . '"');
}
$this->setState(array_pop($this->__stack));
}
/**
* Returns the current token valud
*
* @return string
* @see Iterator
*/
public function current()
{
return $this->_value;
}
/**
* Returns the current token identifier
*
* @return int
* @see Iterator
*/
public function key()
{
return $this->_token;
}
/**
* Resets the lexer to make it as it was just initialized
*
* @param string | null $data If set changes the data string to tokenize
* @see Iterator
*/
public function rewind()
{
$this->__state = 'START';
$this->__stack = array();
$this->__line = 0;
$this->__pos = 0;
$this->__col = 0;
$this->_token = null;
$this->_value = null;
}
/**
* Tells if the lexer is ready to consume more tokens
*
* @return bool
* @see Iterator
*/
public function valid()
{
return $this->__pos < strlen($this->_data);
}
/**
* Consumes the next token from the data
*
* @return void
* @see Iterator
*/
public function next()
{
$lexer = $this->__lexer();
if ($lexer['mode'] === self::MODE_LONGEST) {
$this->__matchLongest();
} else {
$this->__matchFirst();
}
}
/**
* Use this method to apply a lexer definition to skip the runtime building
* process.
*
* @param array $definition A lexer definition
* @return void
* @see getLexerDefinition
*/
public function setLexerDefinition($definition)
{
self::$__lexers[ $this->__classname ] = $definition;
$this->rewind();
}
/**
* Returns the lexer definition so it can be persisted to improve performance
*
* @return array
* @see setLexerDefinition
*/
public function getLexerDefinition()
{
return $this->__lexer();
}
/**
* Tries to match the token with a first-to-match policy
*
* @return void
*/
private function __matchFirst()
{
// Check if the input is over
if ($this->__pos >= strlen($this->_data)) {
return;
}
$lexer = $this->__lexer();
$skip = array();
do {
$state = $this->__state;
// Use the pregenerated regular expression
if (empty($skip)) {
$regexp = $lexer['regexps'][ $state ];
// Build the regexp ignoring the skipped tokens
} else {
$regexp = array();
foreach($lexer['states'][$state] as $idx => $rule) {
if (!in_array($idx, $skip)) {
$regexp[] = $rule['regexp'];
} else {
// Add an impossible to match regexp in its place
$regexp[] = '$^';
}
}
$regexp = '@(' . implode(')|(', $regexp) . ')@Ax';
}
if (preg_match($regexp, $this->_data, $submatches, null, $this->__pos)) {
// Remove empty sub-patterns
$matches = array_filter($submatches, 'strlen');
if (!count($matches)) {
throw new Exception(
'Lexer failed because a rule matched an empty string ' .
'in state "' . $this->__state . '"'
);
}
// Skip global match
next($matches);
// Get lexer token and value
$idx = key($matches);
$this->_value = $value = current($matches);
// Adjust subpatterns count to get the correct index
$token = 0; $subs = 1;
while ($subs < $idx) {
$subs += 1 + $lexer['states'][$state][$token]['subs'];
$token++;
}
$this->_token = $token;
// Extract submatches
$subs = $lexer['states'][$state][$token]['subs'];
$submatches = array_slice($submatches, $idx, $subs);
// Check the rule result
$result = $this->__result($token, $value, $submatches);
if ($result === TRUE) {
return;
} else if ($result === FALSE) {
$skip = array();
continue;
} else {
$skip[] = $result;
continue;
}
} else {
throw new Exception(
'Unexpected input at line' . $this->__line . ' column ' .
$this->__col . ': ' . $this->_data[$this->__pos]
);
}
break;
} while (true);
}
/**
* Tries to match the token with a longest-match policy
*
* @return void
*/
public function __matchLongest()
{
// Check if the input is over
if ($this->__pos >= strlen($this->_data)) {
return false;
}
$lexer = $this->__lexer();
$skip = array();
do {
$state = $this->__state;
$token = 0;
$value = false;
$submatches = array();
foreach ($lexer['states'][$state] as $i => $rule) {
$re = '@' . $rule['regexp'] . '@ASx';
if (!in_array($i, $skip) &&
preg_match($re, $this->_data, $matches, null, $this->__pos)) {
// Check if we found a longer token
if ( $value === false || strlen($matches[0]) > strlen($value) ) {
$value = $matches[0];
$submatches = $matches;
$token = $i;
}
}
}
if ($value === false) {
throw new Exception(
'Unexpected input at line ' . $this->_line . ' column ' .
$this->__col . ': ' . $this->_data[$this->__pos]
);
}
$this->_token = $token;
$this->_value = $value;
// Check the rule result
$result = $this->__result($token, $value, $submatches);
if ($result === TRUE) {
return;
} else if ($result === FALSE) {
$skip = array();
continue;
} else {
$skip[] = $result;
}
} while (true);
}
/**
* Calls the associated method to a rule
*
* If this function returns TRUE means that the token has been accepted.
* If the return value is FALSE means that the token should be ignored.
* If it is an integer it means a rule index and it should be skipped.
*
* @param int $ruleIdx The matched rule index
* @param string $value The matched value
* @param array $submatches The submatches to pass to the method
* @return boolean | string
*/
protected function __result($ruleIdx, $value, $submatches)
{
$lexer = $this->__lexer();
// Keep the current state to detect changes
$state = $this->__state;
// Call the handler
$method = $lexer['states'][$state][$ruleIdx]['method'];
$result = call_user_func( array($this->__object, $method), $submatches );
if ($result === self::TOKEN_ACCEPT) {
$this->__pos += strlen($value);
$this->__line += substr_count($value, "\n");
if ($pos = strrpos($value, "\n")) {
$this->__col = strlen($value) - $pos;
} else {
$this->__col += strlen($value);
}
// accept this token
return true;
} elseif ($result === self::TOKEN_REPEAT) {
if ($state === $this->__state) {
throw new Exception('Rule signalled a repeat but didn\'t change the lexer state');
}
// we have changed state process this token in the new state
$this->next();
return false;
} elseif ($result === self::TOKEN_IGNORE) {
$this->__pos += strlen($value);
$this->__line += substr_count($value, "\n");
if ($pos = strrpos($value, "\n")) {
$this->__col = strlen($value) - $pos;
} else {
$this->__col += strlen($value);
}
// end of input
if ($this->__pos >= strlen($this->_data)) {
return true;
}
// skip this token
return false;
} elseif ($result === self::TOKEN_SKIP) {
if ($state === $this->__state) {
return $ruleIdx;
} else {
// We have changed the state so this is like IGNORE
$this->next();
return true;
}
} else {
throw new Exception(
'Rule returned an invalid value. Only NULL, TRUE and FALSE ' .
'and "skip" values are allowed'
);
}
}
/**
* Returns the lexer data structure and triggers the build process if needed
*
* @return array The lexer data structure
*/
protected function __lexer()
{
if (!isset(self::$__lexers[$this->__classname])) {
self::$__lexers[$this->__classname] = $this->__build();
}
return self::$__lexers[$this->__classname];
}
/**
* Builds the lexer data structure by following the annotations found in the
* target class
*
* @return array The data structure
*/
protected function __build()
{
$states = array();
$regexps = array();
$patterns = array();
$rules = array();
$refl = new ReflectionClass($this->__classname);
// Get global configuration
$tags = $this->__getCommentTags( $refl->getDocComment() );
foreach($tags as $tag) {
if ($tag[0] === 'lexer-pattern') {
if (!preg_match('/^([A-Za-z_]+)\s+(.+)$/', $tag[1], $m)) {
throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
}
$patterns[$m[1]] = rtrim($m[2]);
} else if ($tag[0] === 'lexer-mode') {
if (!preg_match('/^(longest|first)\s*$/', $tag[1], $m)) {
throw new Exception('Malformed @lexer-mode. Only "longest" and "first" are allowed. Found: ' . $tag[1]);
}
$this->__mode = $m[1] === 'first' ? self::MODE_FIRST : self::MODE_LONGEST;
}
}
// Get function rules
$methods = $refl->getMethods();
foreach($methods as $method) {
$tags = $this->__getCommentTags( $method->getDocComment() );
foreach ($tags as $tag) {
if ($tag[0] === 'lexer-pattern') {
if (!preg_match('/^([A-Za-z_]+)\s+(.*)$/', $tag[1], $m)) {
throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
}
$patterns[$m[1]] = rtrim($m[2]);
} else if ($tag[0] === 'lexer-rule') {
if (!preg_match('/^(?:<([A-Za-z_]+)>\s+)?(.+)$/', $tag[1], $m)) {
throw new Exception('Malformed @lexer-parser: ' . $tag[1]);
}
$rules[] = array(
'state' => $m[1] ? $m[1] : 'START',
'method' => $method->getName(),
'expression'=> trim($m[2])
);
}
}
}
// Build the states
foreach ($rules as $rule) {
// Create the state if doesn't exists yet
if (!isset($states[ $rule['state'] ])) {
$states[ $rule['state'] ] = array();
}
$expr = $rule['expression'] . ' ';
$regexp = '';
$pos = 0;
while (preg_match('/"([^"]+)"\s+|([A-Za-z_][A-Za-z0-9_]*)\s+/', $expr, $m, null, $pos)) {
$pos += strlen($m[0]);
if (!empty($m[1])) {
$regexp .= preg_quote($m[1]);
} else {
if (!isset($patterns[$m[2]])) {
throw new Exception('Pattern not found: ' . $m[2]);
}
$regexp .= $patterns[$m[2]];
}
}
if ($pos < strlen($expr)) {
throw new Exception('Malformed rule expression: ' . $rule['expression']);
}
// Calculate the number of subpatterns in the regular expression
// Remove escaped backslashes
$str = str_replace('\\\\', '', $regexp);
// Remove escaped parenthesis
$str = str_replace('\\(', '', $str);
// Remove non-capturing sets
$str = str_replace('(?:', '', $str);
$count = substr_count($str, '(');
$states[ $rule['state'] ][] = array(
'method' => $rule['method'],
'regexp' => str_replace('@', '\@', $regexp),
'subs' => $count
);
}
// For first-match mode build the global regexps and mapping
if ($this->__mode === 'first') {
foreach ($states as $state => $rules) {
$res = array();
foreach($rules as $rule) {
$res[] = $rule['regexp'];
}
$regexps[$state] = '@(' . implode(')|(', $res) . ')@ASx';
}
}
// Store the lexer
return array(
'mode' => $this->__mode,
'states' => $states,
'regexps' => $regexps
);
}
/**
* Parses a comment looking for annotations
*
* @param string $comment
* @return array
*/
private function __getCommentTags($comment)
{
$tags = array();
if (strpos($comment, '/**') !== 0) {
return $tags;
}
// Remove start and end marks
$comment = substr($comment, 3, -2);
$lines = explode("\n", $comment);
foreach ($lines as $ln) {
$ln = ltrim($ln, " \t*");
if (preg_match('/^@([A-Za-z_][A-Za-z_-]*)\s+(.*)$/', $ln, $m)) {
$tags[] = array( $m[1], $m[2] );
}
}
return $tags;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment