Skip to content

Instantly share code, notes, and snippets.

@kartynnik
Last active August 29, 2021 06:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kartynnik/9a3280e53a8ecbc45021aa7a2f9814d0 to your computer and use it in GitHub Desktop.
Save kartynnik/9a3280e53a8ecbc45021aa7a2f9814d0 to your computer and use it in GitHub Desktop.
Parsing bracket expressions via recursive descent
#!/usr/bin/env python3
"""Demonstrates parsing of bracket expressions via recursive descent.
The formal grammar is as follows (notice the lack of left recursion):
S := '(' S ')' S | '[' S ']' S | '{' S '}' S | '<' S '>' S | ''
Observe how `parse_expression` follows this definition.
"""
import enum
from typing import Iterable, Tuple
class NodeType(enum.Enum):
"""A syntax tree node type according to a possible expansion of `S`."""
ROUND = '()'
SQUARE = '[]'
CURLY = '{}'
ANGLE = '<>'
EMPTY = '""'
@staticmethod
def all_brackets() -> Iterable['NodeType']:
return (NodeType.ROUND,
NodeType.SQUARE,
NodeType.CURLY,
NodeType.ANGLE)
def get_bracket_symbols(self) -> Tuple[str, str]:
if self not in NodeType.all_brackets():
raise ValueError(f'Bracket symbols are not defined for `{self}`')
# The following is a clearer equivalent of `return tuple(self.value)`.
left_bracket, right_bracket = self.value
return left_bracket, right_bracket
class SyntaxTree:
"""An abstract syntax tree of the parsed expression."""
def __init__(self, node_type: NodeType, *children: 'SyntaxTree'):
self._node_type = node_type
self._children = children
def __str__(self):
"""The string corresponding to the subexpression of this tree."""
if self._node_type == NodeType.EMPTY:
return ""
assert self._node_type in NodeType.all_brackets()
left_bracket, right_bracket = self._node_type.get_bracket_symbols()
left_child, right_child = self._children
return f'{left_bracket}{left_child}{right_bracket}{right_child}'
def format(self, indent='', indent_below='') -> str:
"""A textual (pseudographical) representation of the tree."""
lines = [f'{indent}{self._node_type.value} => "{self}"']
if self._children != ():
*non_last_children, last_child = self._children
for non_last_child in non_last_children:
lines.append(non_last_child.format(indent_below + '|-',
indent_below + '| '))
lines.append(last_child.format(indent_below + '\-',
indent_below + ' '))
return '\n'.join(lines)
class Reader:
"""Keeps track of the consumed part of the string and what follows it."""
def __init__(self, string: str):
self._string = string
self._position = 0
def at_end(self) -> bool:
"""Indicates whether the whole string has already been read."""
return self._position >= len(self._string)
def skip(self, character_count: int = 1) -> str:
"""Skips `character_count` characters of the string."""
self._position += character_count
def peek(self, character_count: int = 1) -> str:
"""Returns `character_count` next characters of the string."""
return self._string[self._position: self._position + character_count]
def match(self, substring: str) -> bool:
"""Checks whether the next part of the string matches `substring`."""
return self.peek(len(substring)) == substring
def try_read(self, substring: str) -> bool:
"""If the next part of the string matches `substring`, consumes it."""
if self.match(substring):
self.skip(len(substring))
return True
return False
def do_read(self, substring: str) -> None:
"""Like `try_read` but raises a descriptive `ValueError` on failure."""
if not self.try_read(substring):
raise ValueError(f'Expected "{substring}" at {self.where()}, '
f'got "{self.peek(len(substring))}"')
def where(self) -> str:
"""Returns a textual representation of the current reading position."""
return ('end of input' if self.at_end()
else f'character {self._position + 1}')
def parse_expression(reader: Reader) -> SyntaxTree:
for node_type in NodeType.all_brackets():
left_bracket, right_bracket = node_type.get_bracket_symbols()
if reader.try_read(left_bracket):
left_child = parse_expression(reader)
reader.do_read(right_bracket)
right_child = parse_expression(reader)
return SyntaxTree(node_type, left_child, right_child)
return SyntaxTree(NodeType.EMPTY)
def parse_brackets(string: str) -> SyntaxTree:
"""Returns a `SyntaxTree` for the expression in `string`.
A `ValueError` is raised if the expression does not conform to the grammar.
"""
reader = Reader(string)
result = parse_expression(reader)
# It is possible that only a prefix of the string has been parsed above
# with our approach, since the next character does not allow to continue
# the parsed valid expression to preserve its validity (details omitted).
# In other words, we could imagine that there is a special symbol, say '$',
# which is appended to the string, and the actual grammar is amended by
# R := S '$'
# with R being the new root. Then the present function is the parser of R
# reading S and then virtually invoking `do_read` for the imaginary '$'.
# How to determine the set of what could follow is another matter;
# this touches the theory of LL(1) parsing and can be referenced e.g. here:
# https://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL(1)_parsing_table
if not reader.at_end():
raise ValueError(
f'Expected an opening bracket or end of input '
f'at {reader.where()}, got "{reader.peek()}"')
return result
def test(string: str) -> None:
print(f'"{string}":')
try:
result = parse_brackets(string)
print(result.format())
except ValueError as error:
print(error)
print()
if __name__ == '__main__':
test('(([{}])<>[[](){<>}])')
test('([<{')
test('([<{}>]))')
while True:
expression = input('Enter an expression ("q" to quit): ')
if expression == 'q':
break
test(expression)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment