Skip to content

Instantly share code, notes, and snippets.

@unkind
Created March 18, 2019 01:29
Show Gist options
  • Save unkind/87995556f5d8661aa09f5cf82478ba5e to your computer and use it in GitHub Desktop.
Save unkind/87995556f5d8661aa09f5cf82478ba5e to your computer and use it in GitHub Desktop.
<?php
/**
 * Stateful, regex-driven tokenizer for PCRE-style pattern strings.
 *
 * Every lexer state owns one anchored (\G) alternation. Each alternative ends
 * with a (*MARK:name) verb, so the name of the matched token is reported by
 * PCRE in the "MARK" entry of the match array.
 */
final class GeneratedLexer
{
    /**
     * Token regexes keyed by lexer state.
     *
     * - "default": regular pattern syntax (groups, quantifiers, escapes, literals);
     * - "c":       the signed/unsigned number matched as an "index" token;
     * - "co":      comment text up to the first unescaped ")";
     * - "nc":      a name up to the first unescaped ">".
     *
     * All patterns use the "x" modifier, so whitespace between alternatives is
     * insignificant and a literal "#" has to be escaped.
     */
    private const REGEX = [
        'default' => '~\\G
            (?|
            (?:\\n)(*MARK:skip)
            |
            (?:\\[\\^)(*MARK:negative_class_)
            |
            (?:\\[)(*MARK:class_)
            |
            (?:\\])(*MARK:_class)
            |
            (?:\\-)(*MARK:range)
            |
            (?:\\(\\?[\\-\\+]?[imsx]\\))(*MARK:internal_option)
            |
            (?:\\(\\?=)(*MARK:lookahead_)
            |
            (?:\\(\\?!)(*MARK:negative_lookahead_)
            |
            (?:\\(\\?<=)(*MARK:lookbehind_)
            |
            (?:\\(\\?<!)(*MARK:negative_lookbehind_)
            |
            (?:\\(\\?\\(<)(*MARK:named_reference_)
            |
            (?:\\(\\?\\((?=\\d))(*MARK:absolute_reference_)
            |
            (?:\\(\\?\\((?=[\\+\\-]))(*MARK:relative_reference_)
            |
            (?:\\(\\?\\()(*MARK:assertion_reference_)
            |
            (?:\\(\\?\\#)(*MARK:comment_)
            |
            (?:\\(\\?<)(*MARK:named_capturing_)
            |
            (?:\\(\\?:)(*MARK:non_capturing_)
            |
            (?:\\(\\?\\|)(*MARK:non_capturing_reset_)
            |
            (?:\\(\\?>)(*MARK:atomic_group_)
            |
            (?:\\()(*MARK:capturing_)
            |
            (?:\\))(*MARK:_capturing)
            |
            (?:\\?\\+)(*MARK:zero_or_one_possessive)
            |
            (?:\\?\\?)(*MARK:zero_or_one_lazy)
            |
            (?:\\?)(*MARK:zero_or_one)
            |
            (?:\\*\\+)(*MARK:zero_or_more_possessive)
            |
            (?:\\*\\?)(*MARK:zero_or_more_lazy)
            |
            (?:\\*)(*MARK:zero_or_more)
            |
            (?:\\+\\+)(*MARK:one_or_more_possessive)
            |
            (?:\\+\\?)(*MARK:one_or_more_lazy)
            |
            (?:\\+)(*MARK:one_or_more)
            |
            (?:\\{[0-9]+\\})(*MARK:exactly_n)
            |
            (?:\\{[0-9]+,[0-9]+\\}\\+)(*MARK:n_to_m_possessive)
            |
            (?:\\{[0-9]+,[0-9]+\\}\\?)(*MARK:n_to_m_lazy)
            |
            (?:\\{[0-9]+,[0-9]+\\})(*MARK:n_to_m)
            |
            (?:\\{[0-9]+,\\}\\+)(*MARK:n_or_more_possessive)
            |
            (?:\\{[0-9]+,\\}\\?)(*MARK:n_or_more_lazy)
            |
            (?:\\{[0-9]+,\\})(*MARK:n_or_more)
            |
            (?:\\|)(*MARK:alternation)
            |
            (?:\\\\([aefnrt]|c[\\x00-\\x7f]))(*MARK:character)
            |
            (?:\\\\([0-7]{3}|x[0-9a-fA-F]{2}|x{[0-9a-fA-F]+}))(*MARK:dynamic_character)
            |
            (?:\\\\([CdDhHNRsSvVwWX]|[pP]{[^}]+}))(*MARK:character_type)
            |
            (?:\\\\([bBAZzG])|\\^|\\$)(*MARK:anchor)
            |
            (?:\\\\K)(*MARK:match_point_reset)
            |
            (?:\\\\.|.)(*MARK:literal)
            )~x',
        'c' => '~\\G
            (?|
            (?:[\\+\\-]?\\d+)(*MARK:index)
            )~x',
        'co' => '~\\G
            (?|
            (?:\\))(*MARK:_comment)
            |
            (?:.*?(?=(?<!\\\\)\\)))(*MARK:comment)
            )~x',
        'nc' => '~\\G
            (?|
            (?:>)(*MARK:_named_capturing)
            |
            (?:.+?(?=(?<!\\\\)>))(*MARK:capturing_name)
            )~x',
    ];

    /**
     * Lexer state entered right after the given token has been emitted.
     * Tokens that are not listed leave the current state untouched.
     */
    private const TRANSITIONS = [
        'named_reference_'    => 'nc',
        'absolute_reference_' => 'c',
        'relative_reference_' => 'c',
        'comment_'            => 'co',
        'named_capturing_'    => 'nc',
        'index'               => 'default',
        '_comment'            => 'default',
        '_named_capturing'    => 'default',
    ];

    /**
     * Lazily splits $input into tokens.
     *
     * Each yielded item is a list [token id, raw text, end offset], where the
     * end offset is the byte position immediately after the token. Newlines
     * matched in the default state ("skip" tokens) are consumed but never yielded.
     *
     * @param string $input Pattern source to tokenize.
     *
     * @return Generator<int, array{0: string, 1: string, 2: int}>
     *
     * @throws LogicException   When PCRE itself fails while matching a state regex.
     * @throws RuntimeException When nothing in the current state matches at the current offset.
     */
    public static function lex(string $input): Generator
    {
        $length = strlen($input);
        $line = 1;
        $offset = 0;
        $state = 'default';

        while ($offset < $length) {
            // One token per call: the state may change after any token, so
            // matching ahead with preg_match_all() would only produce matches
            // that have to be thrown away and recomputed after a transition.
            $found = preg_match(self::REGEX[$state], $input, $match, 0, $offset);

            if ($found === false) {
                throw new LogicException(
                    sprintf('Regex (state "%s") is broken (error code: %d).', $state, preg_last_error())
                );
            }

            // A zero-width match cannot advance the cursor, so it is reported
            // like "no match" instead of looping forever.
            if ($found === 0 || $match[0] === '') {
                throw new RuntimeException(
                    sprintf(
                        'Parsing error at line %d (offset: %d) near "%s": unrecognized token.',
                        $line,
                        $offset,
                        substr($input, $offset, 10)
                    )
                );
            }

            $tokenId = $match['MARK'];
            $raw = $match[0];
            $line += substr_count($raw, "\n");
            $offset += strlen($raw);

            if ($tokenId === 'skip') {
                continue;
            }

            yield [$tokenId, $raw, $offset];

            $state = self::TRANSITIONS[$tokenId] ?? $state;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment