Skip to content

Instantly share code, notes, and snippets.

@rr-
Created October 1, 2014 09:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rr-/4056c97c8d045487ad41 to your computer and use it in GitHub Desktop.
Save rr-/4056c97c8d045487ad41 to your computer and use it in GitHub Desktop.
Example of a tokenizer for PHP
<?php
namespace Tokenizer;
/**
 * A single lexical token: a token type paired with the source text it covers.
 *
 * Instances are plain mutable records; the Tokenizer class in this file
 * creates one per match and fills in both properties.
 */
class Token
{
    /**
     * Token type; the Tokenizer assigns one of the TOKEN_* constant values.
     *
     * @var string|null
     */
    public $type = null;

    /**
     * The matched source text (the Tokenizer stores the full regex match here).
     *
     * @var string|null
     */
    public $content = null;
}
// Names of every token type the tokenizer can emit, in declaration order.
$tokenList = [
    // Open tag, whitespace, brackets and statement terminator.
    'TOKEN_OPEN_TAG',
    'TOKEN_NEWLINE',
    'TOKEN_TAB',
    'TOKEN_SPACE',
    'TOKEN_CURLY_OPEN',
    'TOKEN_CURLY_CLOSE',
    'TOKEN_ROUND_OPEN',
    'TOKEN_ROUND_CLOSE',
    'TOKEN_SQUARE_OPEN',
    'TOKEN_SQUARE_CLOSE',
    'TOKEN_SEMICOLON',

    // Keywords and modifiers.
    'TOKEN_NAMESPACE',
    'TOKEN_CLASS',
    'TOKEN_FUNCTION',
    'TOKEN_IF',
    'TOKEN_ELSE',
    'TOKEN_ELSEIF',
    'TOKEN_SWITCH',
    'TOKEN_CASE',
    'TOKEN_BREAK',
    'TOKEN_FOR',
    'TOKEN_FOREACH',
    'TOKEN_RETURN',
    'TOKEN_WHILE',
    'TOKEN_THROW',
    'TOKEN_CATCH',
    'TOKEN_FINALLY',
    'TOKEN_CLASS_VISIBILITY_PRIVATE',
    'TOKEN_CLASS_VISIBILITY_PROTECTED',
    'TOKEN_CLASS_VISIBILITY_PUBLIC',
    'TOKEN_STATIC',

    // Bare words, variables, string literals and the comma.
    'TOKEN_STRING',
    'TOKEN_VARIABLE',
    'TOKEN_QUOTED_STRING',
    'TOKEN_COMMA',

    // Operators and the namespace separator.
    'TOKEN_OPERATOR_OBJECT',
    'TOKEN_OPERATOR_STATIC_OBJECT',
    'TOKEN_OPERATOR_ASSIGNMENT',
    'TOKEN_OPERATOR_EQUAL',
    'TOKEN_OPERATOR_NOT_EQUAL',
    'TOKEN_OPERATOR_IDENTICAL',
    'TOKEN_OPERATOR_NOT_IDENTICAL',
    'TOKEN_OPERATOR_NEGATION',
    'TOKEN_OPERATOR_SHIFT_LEFT',
    'TOKEN_OPERATOR_SHIFT_RIGHT',
    'TOKEN_OPERATOR_SHIFT_LEFT_INPLACE',
    'TOKEN_OPERATOR_SHIFT_RIGHT_INPLACE',
    'TOKEN_OPERATOR_PLUS',
    'TOKEN_OPERATOR_MINUS',
    'TOKEN_OPERATOR_MULTIPLY',
    'TOKEN_OPERATOR_DIVIDE',
    'TOKEN_OPERATOR_CONCATENATE',
    'TOKEN_OPERATOR_LESS_OR_EQUAL',
    'TOKEN_OPERATOR_GREATER_OR_EQUAL',
    'TOKEN_OPERATOR_LESS',
    'TOKEN_OPERATOR_POWER',
    'TOKEN_OPERATOR_GREATER',
    'TOKEN_OPERATOR_SHORTHAND_IF_QUESTIONMARK',
    'TOKEN_OPERATOR_SHORTHAND_IF_COLON',
    'TOKEN_NAMESPACE_RESOLVER',
];

// Turn each name into a constant whose value is the name itself, so a token's
// type is directly readable when printed. define() with an unqualified name
// creates a global constant; the unqualified TOKEN_* references used inside
// this file's namespace resolve to it through PHP's global-constant fallback.
foreach ($tokenList as $tokenType) {
    define($tokenType, $tokenType);
}
/**
 * Regex-driven lexer that splits source text into a flat list of Token objects.
 *
 * The input is consumed from the front: at every step the pattern in
 * $matchers that matches the longest prefix of the remaining text wins, a
 * Token is emitted for it, and the matched text is removed from the buffer.
 */
class Tokenizer
{
    /**
     * Map of PCRE fragment => token type.
     *
     * Keys are regex bodies without delimiters or anchors; they are wrapped
     * and anchored by getHeadTokenFromBuffer(). Declaration order only breaks
     * ties between equally long matches, e.g. the keyword 'class' wins over
     * the generic '\w+' rule because it is declared first.
     *
     * @var array
     */
    private $matchers =
    [
        '<\?php|<\?|<\%' => TOKEN_OPEN_TAG,
        "[\r\n]+" => TOKEN_NEWLINE,
        "\t" => TOKEN_TAB,
        " " => TOKEN_SPACE,
        'namespace' => TOKEN_NAMESPACE,
        'class' => TOKEN_CLASS,
        'function' => TOKEN_FUNCTION,
        'if' => TOKEN_IF,
        'else' => TOKEN_ELSE,
        'elseif' => TOKEN_ELSEIF,
        'switch' => TOKEN_SWITCH,
        'case' => TOKEN_CASE,
        'break' => TOKEN_BREAK,
        'for' => TOKEN_FOR,
        'foreach' => TOKEN_FOREACH,
        'return' => TOKEN_RETURN,
        'while' => TOKEN_WHILE,
        'throw' => TOKEN_THROW,
        'catch' => TOKEN_CATCH,
        'finally' => TOKEN_FINALLY,
        'private' => TOKEN_CLASS_VISIBILITY_PRIVATE,
        'protected' => TOKEN_CLASS_VISIBILITY_PROTECTED,
        'public' => TOKEN_CLASS_VISIBILITY_PUBLIC,
        'static' => TOKEN_STATIC,
        ';' => TOKEN_SEMICOLON,
        ',' => TOKEN_COMMA,
        '{' => TOKEN_CURLY_OPEN,
        '}' => TOKEN_CURLY_CLOSE,
        '\[' => TOKEN_SQUARE_OPEN,
        '\]' => TOKEN_SQUARE_CLOSE,
        '\(' => TOKEN_ROUND_OPEN,
        '\)' => TOKEN_ROUND_CLOSE,
        '(["\'])(?:(?!\1)[^\\\]|\\\.)*\1' => TOKEN_QUOTED_STRING,
        '\?' => TOKEN_OPERATOR_SHORTHAND_IF_QUESTIONMARK,
        ':' => TOKEN_OPERATOR_SHORTHAND_IF_COLON,
        '=' => TOKEN_OPERATOR_ASSIGNMENT,
        '==' => TOKEN_OPERATOR_EQUAL,
        '===' => TOKEN_OPERATOR_IDENTICAL,
        '!' => TOKEN_OPERATOR_NEGATION,
        '<<' => TOKEN_OPERATOR_SHIFT_LEFT,
        '>>' => TOKEN_OPERATOR_SHIFT_RIGHT,
        '<<=' => TOKEN_OPERATOR_SHIFT_LEFT_INPLACE,
        '>>=' => TOKEN_OPERATOR_SHIFT_RIGHT_INPLACE,
        '!=' => TOKEN_OPERATOR_NOT_EQUAL,
        '!==' => TOKEN_OPERATOR_NOT_IDENTICAL,
        '>=' => TOKEN_OPERATOR_GREATER_OR_EQUAL,
        '<=' => TOKEN_OPERATOR_LESS_OR_EQUAL,
        '\+' => TOKEN_OPERATOR_PLUS,
        '-' => TOKEN_OPERATOR_MINUS,
        '\*\*' => TOKEN_OPERATOR_POWER,
        '\*' => TOKEN_OPERATOR_MULTIPLY,
        '\/' => TOKEN_OPERATOR_DIVIDE,
        '\.' => TOKEN_OPERATOR_CONCATENATE,
        '>' => TOKEN_OPERATOR_GREATER,
        '<' => TOKEN_OPERATOR_LESS,
        '->' => TOKEN_OPERATOR_OBJECT,
        '::' => TOKEN_OPERATOR_STATIC_OBJECT,
        '\$\w+' => TOKEN_VARIABLE,
        '\w+' => TOKEN_STRING,
        '\\\\' => TOKEN_NAMESPACE_RESOLVER,
    ];

    /**
     * Reads a file and tokenizes its entire contents.
     *
     * @param string $filePath Path (or stream URL) accepted by file_get_contents().
     * @return Token[] Tokens in source order.
     * @throws \RuntimeException If the file cannot be read.
     * @throws \Exception If the contents contain text no matcher recognises.
     */
    public function tokenizeFile($filePath)
    {
        $buffer = file_get_contents($filePath);
        if ($buffer === false) {
            // Fail here with the real cause instead of passing false on to the
            // tokenizer, where it would surface as a misleading syntax error.
            throw new \RuntimeException('Cannot read file ' . $filePath);
        }

        return $this->tokenizeBuffer($buffer);
    }

    /**
     * Splits a string into tokens.
     *
     * Concatenating the content of the returned tokens reproduces the input.
     *
     * @param string $buffer Source text to tokenize.
     * @return Token[] Tokens in source order; an empty array for empty input.
     * @throws \Exception If some position in the input matches no pattern.
     */
    public function tokenizeBuffer($buffer)
    {
        $remaining = (string) $buffer;
        $tokens = [];

        while ($remaining !== '') {
            $token = $this->getHeadTokenFromBuffer($remaining);
            if ($token === null) {
                throw new \Exception('Syntax error near ' . substr($remaining, 0, 50) . '...');
            }

            $tokens[] = $token;

            // Before PHP 8, substr() returns false rather than '' once the
            // whole buffer has been consumed; the cast keeps the loop
            // condition correct on every version.
            $remaining = (string) substr($remaining, strlen($token->content));
        }

        return $tokens;
    }

    /**
     * Finds the token at the very start of the buffer.
     *
     * Every matcher is tried and the longest match wins, so '==' is preferred
     * over '=', 'foreach' over 'for', and an identifier such as 'classes' is
     * not split into the keyword 'class' plus 'es'. Equal-length matches go to
     * the pattern declared first. Empty matches are ignored, which guarantees
     * that the caller always makes progress.
     *
     * @param string $currentBuffer Remaining, non-empty source text.
     * @return Token|null The head token, or null if no pattern matches.
     * @throws \RuntimeException If PCRE reports an error while matching.
     */
    private function getHeadTokenFromBuffer($currentBuffer)
    {
        $bestType = null;
        $bestContent = '';

        foreach ($this->matchers as $regex => $tokenType) {
            // The non-capturing group makes '^' apply to the whole pattern.
            // Without it, '^a|b|c' anchors only 'a', and 'b' or 'c' could match
            // anywhere in the buffer. Group numbering (\1) is unaffected.
            $result = preg_match('/^(?:' . $regex . ')/', $currentBuffer, $matches);
            if ($result === false) {
                throw new \RuntimeException(
                    'PCRE error ' . preg_last_error() . ' while matching /' . $regex . '/'
                );
            }

            if ($result === 1 && strlen($matches[0]) > strlen($bestContent)) {
                $bestType = $tokenType;
                $bestContent = $matches[0];
            }
        }

        if ($bestType === null) {
            return null;
        }

        $token = new Token();
        $token->type = $bestType;
        $token->content = $bestContent;

        return $token;
    }
}
// Demo driver: builds a Tokenizer, tokenizes the file named by $argv[0] and
// prints the resulting list of Token objects with print_r().
// NOTE(review): under the CLI, $argv[0] is the path of the running script, so
// this appears to tokenize this very file. Confirm that is intended rather
// than a user-supplied path in $argv[1].
$tokenizer = new Tokenizer();
$tokens = $tokenizer->tokenizeFile($argv[0]);
print_r($tokens);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment