Last active
August 29, 2021 06:21
-
-
Save kartynnik/9a3280e53a8ecbc45021aa7a2f9814d0 to your computer and use it in GitHub Desktop.
Parsing bracket expressions via recursive descent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Demonstrates parsing of bracket expressions via recursive descent. | |
The formal grammar is as follows (notice the lack of left recursion): | |
S := '(' S ')' S | '[' S ']' S | '{' S '}' S | '<' S '>' S | '' | |
Observe how `parse_expression` follows this definition. | |
""" | |
import enum | |
from typing import Iterable, Tuple | |
class NodeType(enum.Enum): | |
"""A syntax tree node type according to a possible expansion of `S`.""" | |
ROUND = '()' | |
SQUARE = '[]' | |
CURLY = '{}' | |
ANGLE = '<>' | |
EMPTY = '""' | |
@staticmethod | |
def all_brackets() -> Iterable['NodeType']: | |
return (NodeType.ROUND, | |
NodeType.SQUARE, | |
NodeType.CURLY, | |
NodeType.ANGLE) | |
def get_bracket_symbols(self) -> Tuple[str, str]: | |
if self not in NodeType.all_brackets(): | |
raise ValueError(f'Bracket symbols are not defined for `{self}`') | |
# The following is a clearer equivalent of `return tuple(self.value)`. | |
left_bracket, right_bracket = self.value | |
return left_bracket, right_bracket | |
class SyntaxTree: | |
"""An abstract syntax tree of the parsed expression.""" | |
def __init__(self, node_type: NodeType, *children: 'SyntaxTree'): | |
self._node_type = node_type | |
self._children = children | |
def __str__(self): | |
"""The string corresponding to the subexpression of this tree.""" | |
if self._node_type == NodeType.EMPTY: | |
return "" | |
assert self._node_type in NodeType.all_brackets() | |
left_bracket, right_bracket = self._node_type.get_bracket_symbols() | |
left_child, right_child = self._children | |
return f'{left_bracket}{left_child}{right_bracket}{right_child}' | |
def format(self, indent='', indent_below='') -> str: | |
"""A textual (pseudographical) representation of the tree.""" | |
lines = [f'{indent}{self._node_type.value} => "{self}"'] | |
if self._children != (): | |
*non_last_children, last_child = self._children | |
for non_last_child in non_last_children: | |
lines.append(non_last_child.format(indent_below + '|-', | |
indent_below + '| ')) | |
lines.append(last_child.format(indent_below + '\-', | |
indent_below + ' ')) | |
return '\n'.join(lines) | |
class Reader: | |
"""Keeps track of the consumed part of the string and what follows it.""" | |
def __init__(self, string: str): | |
self._string = string | |
self._position = 0 | |
def at_end(self) -> bool: | |
"""Indicates whether the whole string has already been read.""" | |
return self._position >= len(self._string) | |
def skip(self, character_count: int = 1) -> str: | |
"""Skips `character_count` characters of the string.""" | |
self._position += character_count | |
def peek(self, character_count: int = 1) -> str: | |
"""Returns `character_count` next characters of the string.""" | |
return self._string[self._position: self._position + character_count] | |
def match(self, substring: str) -> bool: | |
"""Checks whether the next part of the string matches `substring`.""" | |
return self.peek(len(substring)) == substring | |
def try_read(self, substring: str) -> bool: | |
"""If the next part of the string matches `substring`, consumes it.""" | |
if self.match(substring): | |
self.skip(len(substring)) | |
return True | |
return False | |
def do_read(self, substring: str) -> None: | |
"""Like `try_read` but raises a descriptive `ValueError` on failure.""" | |
if not self.try_read(substring): | |
raise ValueError(f'Expected "{substring}" at {self.where()}, ' | |
f'got "{self.peek(len(substring))}"') | |
def where(self) -> str: | |
"""Returns a textual representation of the current reading position.""" | |
return ('end of input' if self.at_end() | |
else f'character {self._position + 1}') | |
def parse_expression(reader: Reader) -> SyntaxTree: | |
for node_type in NodeType.all_brackets(): | |
left_bracket, right_bracket = node_type.get_bracket_symbols() | |
if reader.try_read(left_bracket): | |
left_child = parse_expression(reader) | |
reader.do_read(right_bracket) | |
right_child = parse_expression(reader) | |
return SyntaxTree(node_type, left_child, right_child) | |
return SyntaxTree(NodeType.EMPTY) | |
def parse_brackets(string: str) -> SyntaxTree: | |
"""Returns a `SyntaxTree` for the expression in `string`. | |
A `ValueError` is raised if the expression does not conform to the grammar. | |
""" | |
reader = Reader(string) | |
result = parse_expression(reader) | |
# It is possible that only a prefix of the string has been parsed above | |
# with our approach, since the next character does not allow to continue | |
# the parsed valid expression to preserve its validity (details omitted). | |
# In other words, we could imagine that there is a special symbol, say '$', | |
# which is appended to the string, and the actual grammar is amended by | |
# R := S '$' | |
# with R being the new root. Then the present function is the parser of R | |
# reading S and then virtually invoking `do_read` for the imaginary '$'. | |
# How to determine the set of what could follow is another matter; | |
# this touches the theory of LL(1) parsing and can be referenced e.g. here: | |
# https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL(1)_parsing_table | |
if not reader.at_end(): | |
raise ValueError( | |
f'Expected an opening bracket or end of input ' | |
f'at {reader.where()}, got "{reader.peek()}"') | |
return result | |
def test(string: str) -> None: | |
print(f'"{string}":') | |
try: | |
result = parse_brackets(string) | |
print(result.format()) | |
except ValueError as error: | |
print(error) | |
print() | |
if __name__ == '__main__': | |
test('(([{}])<>[[](){<>}])') | |
test('([<{') | |
test('([<{}>]))') | |
while True: | |
expression = input('Enter an expression ("q" to quit): ') | |
if expression == 'q': | |
break | |
test(expression) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment