Skip to content

Instantly share code, notes, and snippets.

@iamluc
Created December 1, 2015 08:23
Show Gist options
  • Save iamluc/29a5a769907b4f7a6f09 to your computer and use it in GitHub Desktop.
Save iamluc/29a5a769907b4f7a6f09 to your computer and use it in GitHub Desktop.
Simple Lexer in PHP
<?php
/**
 * Minimal regex-driven lexer for search-style query strings.
 *
 * The input is consumed left to right; at each position the token patterns
 * are tried in declaration order and the first one that matches at the very
 * start of the remaining input wins.
 */
class Lexer
{
    /**
     * Token name => PCRE pattern, in priority order.
     *
     * Capture group 1 of every pattern is the token's value; the full match
     * (including surrounding whitespace) is what gets consumed from the input.
     * Keywords (T_OR / T_AND) are listed before T_WORD so they take precedence,
     * and they must be followed by whitespace or end of input to match.
     *
     * @var array<string, string>
     */
    protected static $tokens = [
        // Double-quoted phrase; the closing quote is optional.
        'T_PHRASE' => '/\s*"(([^"\\\\]|\\\\\\\\|\\\\"|\\\\)+)"?(\s+|$)/',
        'T_OR' => '/\s*(or)(\s+|$)/',
        'T_AND' => '/\s*(and)(\s+|$)/',
        // Consumes the asterisk only, never the whitespace that follows it.
        'T_WILDCARD' => '/\s*(\*)/',
        // Any run of characters that are neither whitespace nor an asterisk.
        'T_WORD' => '/\s*([^ \t\r\n\v\f\*]+)(\s*|$)/',
    ];

    /**
     * Splits a query string into an ordered list of tokens.
     *
     * @param string $subject The raw query string.
     *
     * @return array[] List of single-entry arrays shaped [tokenName => value].
     *
     * @throws Exception If some non-blank part of the subject matches no token pattern.
     */
    public function tokenize($subject)
    {
        $tokens = [];
        $offset = 0;
        $length = strlen($subject);

        while ($offset < $length) {
            $remaining = substr($subject, $offset);

            // Only whitespace is left (for instance after a trailing wildcard,
            // which does not swallow following spaces): there is nothing more
            // to tokenize, so stop instead of failing on the blank remainder.
            if (1 === preg_match('/\s+$/AD', $remaining)) {
                break;
            }

            $token = $this->match($remaining);
            if (false === $token) {
                throw new Exception(sprintf('Unable to parse subject "%s"', $subject));
            }

            $offset += $token['size'];
            $tokens[] = [$token['token'] => $token['match']];
        }

        return $tokens;
    }

    /**
     * Finds the first token pattern matching at the very start of $subject.
     *
     * @param string $subject The not yet consumed part of the input.
     *
     * @return array|false Array with keys 'size' (bytes consumed, whitespace
     *                     included), 'match' (token value) and 'token' (token
     *                     name), or false when no pattern matches.
     */
    public function match($subject)
    {
        foreach (self::$tokens as $name => $pattern) {
            $matches = [];
            // The appended "A" modifier anchors the pattern at the start of the subject.
            if (1 === preg_match($pattern.'A', $subject, $matches)) {
                return [
                    'size' => strlen($matches[0]),
                    'match' => $matches[1],
                    'token' => $name,
                ];
            }
        }

        return false;
    }

    /**
     * Renders the names of the given tokens as a comma-separated string.
     *
     * Accepts both the entries produced by tokenize() ([tokenName => value])
     * and the arrays returned by match() (which carry a 'token' key).
     *
     * @param array[] $tokens List of tokens.
     *
     * @return string Token names joined with ", " (empty string for no tokens).
     */
    public function debug(array $tokens)
    {
        $res = [];
        foreach ($tokens as $token) {
            if (isset($token['token'])) {
                $res[] = $token['token'];
                continue;
            }

            // tokenize() entry: the token name is the array's only key.
            reset($token);
            $res[] = key($token);
        }

        return implode(', ', $res);
    }
}
// Self-check fixture: each key is a query string, each value the exact token
// list tokenize() is expected to return for it.
$tests = [
    // An unterminated phrase is still expected to come out as one phrase.
    '"salut les amis' => [
        ['T_PHRASE' => 'salut les amis'],
    ],
    '"salut les" amis' => [
        ['T_PHRASE' => 'salut les'],
        ['T_WORD' => 'amis'],
    ],
    'coo "salut les" amis' => [
        ['T_WORD' => 'coo'],
        ['T_PHRASE' => 'salut les'],
        ['T_WORD' => 'amis'],
    ],
    'bob or marley' => [
        ['T_WORD' => 'bob'],
        ['T_OR' => 'or'],
        ['T_WORD' => 'marley'],
    ],
    // A keyword inside quotes is part of the phrase, not an operator.
    '"bob or" marley' => [
        ['T_PHRASE' => 'bob or'],
        ['T_WORD' => 'marley'],
    ],
    '"bob marley" or "bob dylan"' => [
        ['T_PHRASE' => 'bob marley'],
        ['T_OR' => 'or'],
        ['T_PHRASE' => 'bob dylan'],
    ],
    'symfon* or zf' => [
        ['T_WORD' => 'symfon'],
        ['T_WILDCARD' => '*'],
        ['T_OR' => 'or'],
        ['T_WORD' => 'zf'],
    ],
    // A wildcard in the middle of a word splits it into word / wildcard / word.
    'su*er co*ol and bob' => [
        ['T_WORD' => 'su'],
        ['T_WILDCARD' => '*'],
        ['T_WORD' => 'er'],
        ['T_WORD' => 'co'],
        ['T_WILDCARD' => '*'],
        ['T_WORD' => 'ol'],
        ['T_AND' => 'and'],
        ['T_WORD' => 'bob'],
    ],
];

$lexer = new Lexer();

// Run every fixture through the lexer; print a diff-style report on mismatch.
foreach ($tests as $subject => $expected) {
    echo 'Testing: ', $subject, "\n";

    $actual = $lexer->tokenize($subject);
    if ($actual === $expected) {
        continue;
    }

    echo "---> Error, expected\n";
    print_r($expected);
    echo "get\n";
    print_r($actual);
}
@Florian418
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment