|
#!/usr/bin/env python3 |
|
|
|
# Postpone evaluation of annotations (PEP 563) so newer annotation syntax
# also works on older Python versions (note: the `X | Y` unions used in the
# runtime type aliases below still require Python >= 3.10)
|
from __future__ import annotations |
|
|
|
# Standard library dependencies
|
import os |
|
from dataclasses import dataclass |
|
from typing import Dict, List, Final, Tuple, Optional, Set |
|
|
|
# Installed dependencies |
|
import pandas as pd |
|
|
|
# Types |
|
WordStr = str |
|
GrammarRulesStr = str |
|
LookAheadToggle = bool |
|
GrammarNonTerminal = str |
|
GrammarTerminal = str |
|
GrammarProduction = List[GrammarTerminal | GrammarNonTerminal] |
|
GrammarRules = Dict[GrammarNonTerminal, List[GrammarProduction]] |
|
FirstSets = Dict[GrammarNonTerminal, Set[GrammarTerminal]] |
|
FollowSets = Dict[GrammarNonTerminal, Set[GrammarTerminal]] |
|
Arity = int |
|
Derivation = Tuple[GrammarTerminal | GrammarNonTerminal, Arity] |
|
DerivationHistory = List[Derivation] |
|
Stack = List[GrammarTerminal | GrammarNonTerminal] |
|
InputBuffer = List[str] |
|
Action = str |
|
History = List[ |
|
Tuple[Stack, InputBuffer, Action, Optional[Derivation], Optional[DerivationHistory]] |
|
] |
|
|
|
|
|
@dataclass |
|
class DerivationTreeNode: |
|
symbol: str |
|
children: Optional[List[DerivationTreeNode]] = None |
|
|
|
def __repr__(self): |
|
if self.children is None or len(self.children) == 0: |
|
return f"{self.symbol!r}" |
|
return f"{self.symbol!r} $\\rightarrow$ ({', '.join(repr(child) for child in self.children)})" |
|
|
|
|
|
# Runtime data |
|
data: Final[List[Tuple[WordStr, GrammarRulesStr, LookAheadToggle]]] = [ |
|
( |
|
"dnvdn", |
|
"S → NP VP\n" + "NP → 'd' 'n'\n" + "VP → 'v' NP NP | 'v' NP | 'v'", |
|
False, |
|
), |
|
( |
|
"(3*(7/-2))", |
|
"S → '(' S Op S ')' | Num\n" |
|
+ "Num → Sign Num | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '0' \n" |
|
+ "Op → '+' | '-' | '*' | '/'\n" |
|
+ "Sign → '-'\n", |
|
True, |
|
), |
|
] |
|
|
|
|
|
def get_grammar_rules_from_string(grammar_rules_str: str) -> GrammarRules: |
|
rules: GrammarRules = dict() |
|
for grammar_rule_str in grammar_rules_str.splitlines(): |
|
non_terminal, production_str = tuple(grammar_rule_str.split(" → ", 1)) |
|
if non_terminal not in rules: |
|
rules[non_terminal] = list() |
|
for production in production_str.split(" | "): |
|
rules[non_terminal].append([x.strip("'") for x in production.split(" ")]) |
|
return rules |
|
|
|
|
|
def calculate_first_sets(grammar_rules: GrammarRules) -> FirstSets: |
|
first_sets: FirstSets = dict() |
|
for non_terminal, productions in grammar_rules.items(): |
|
if non_terminal not in first_sets: |
|
first_sets[non_terminal] = set() |
|
|
|
def find_first_set(symbol: str) -> Set[str]: |
|
first_set = set() |
|
if symbol in grammar_rules: |
|
for production in grammar_rules[symbol]: |
|
if production[0] not in grammar_rules: |
|
first_set.add(production[0]) |
|
first_set = first_set.union(find_first_set(production[0])) |
|
return first_set |
|
|
|
first_sets[non_terminal] = first_sets[non_terminal].union( |
|
find_first_set(non_terminal) |
|
) |
|
return first_sets |
|
|
|
|
|
def calculate_follow_set(grammar_rules: GrammarRules) -> FollowSets: |
|
first_sets = calculate_first_sets(grammar_rules) |
|
follow_sets: FollowSets = {"S": {"$"}} |
|
for non_terminal, productions in grammar_rules.items(): |
|
if non_terminal not in follow_sets: |
|
follow_sets[non_terminal] = set() |
|
for production in productions: |
|
for prod_non_terminal in production[1:]: |
|
if prod_non_terminal not in grammar_rules: |
|
continue |
|
follow_sets[non_terminal] = follow_sets[non_terminal].union( |
|
first_sets[non_terminal] |
|
) |
|
return follow_sets |
|
|
|
|
|
def build_tree( |
|
derivations: List[Derivation], |
|
) -> Tuple[DerivationTreeNode, List[Derivation]]: |
|
if len(derivations) == 0: |
|
raise RuntimeError("Empty derivation list!") |
|
symbol, arity = derivations[0] |
|
derivations_rest = derivations[1:] |
|
subtrees = [] |
|
for i in range(arity): |
|
subtree, derivations_rest = build_tree(derivations_rest) |
|
subtrees.append(subtree) |
|
return DerivationTreeNode(symbol=symbol, children=subtrees), derivations_rest |
|
|
|
|
|
def top_down_parser_backtrack( |
|
grammar_rules: GrammarRules, |
|
input_buffer: InputBuffer, |
|
stack: Optional[Stack] = None, |
|
history: Optional[History] = None, |
|
derivation_history: Optional[DerivationHistory] = None, |
|
look_ahead: Optional[Tuple[FirstSets, FollowSets]] = None, |
|
stop_at_first_solution: bool = True, |
|
) -> Tuple[History, bool]: |
|
if stack is None: |
|
stack = ["S"] |
|
if history is None: |
|
history = [(stack.copy(), input_buffer.copy(), "start", None, None)] |
|
if derivation_history is None: |
|
derivation_history = [] |
|
# Termination condition / Accept |
|
if len(stack) == 0 and len(input_buffer) == 0: |
|
history.append( |
|
( |
|
stack.copy(), |
|
input_buffer.copy(), |
|
f"accept{'' if stop_at_first_solution else ', backtrack'}", |
|
None, |
|
derivation_history, |
|
) |
|
) |
|
return history, True |
|
# Match |
|
if len(stack) > 0 and len(input_buffer) > 0 and stack[-1] == input_buffer[0]: |
|
derivation_match: Derivation = (stack[-1], 0) |
|
history.append( |
|
( |
|
stack[:-1], |
|
input_buffer[1:], |
|
f"match({stack[-1]!r})", |
|
derivation_match, |
|
None, |
|
) |
|
) |
|
new_history, accept = top_down_parser_backtrack( |
|
grammar_rules, |
|
input_buffer[1:], |
|
stack[:-1], |
|
[], |
|
derivation_history=derivation_history + [derivation_match], |
|
look_ahead=look_ahead, |
|
stop_at_first_solution=stop_at_first_solution, |
|
) |
|
return [*history, *new_history], accept |
|
# Expand rules |
|
if len(stack) > 0 and len(input_buffer) > 0 and stack[-1] in grammar_rules: |
|
for production in grammar_rules[stack[-1]]: |
|
if look_ahead is not None: |
|
# Check if the production will at any point match the top element in the input buffer |
|
if ( |
|
production[0] not in look_ahead[0] |
|
and production[0] != input_buffer[0] |
|
): |
|
history.append( |
|
( |
|
stack.copy(), |
|
input_buffer.copy(), |
|
f"look-ahead({input_buffer[0]!r} != {production[0]!r} [{stack[-1]!r} $\\rightarrow$ {' '.join([f'{x!r}' for x in production])}])", |
|
None, |
|
None, |
|
) |
|
) |
|
continue |
|
if ( |
|
production[0] in look_ahead[0] |
|
and input_buffer[0] not in look_ahead[0][production[0]] |
|
): |
|
history.append( |
|
( |
|
stack.copy(), |
|
input_buffer.copy(), |
|
f"look-ahead({input_buffer[0]!r} $\\not\\in$ FIRST({production[0]!r}) [{stack[-1]!r} $\\rightarrow$ {' '.join([f'{x!r}' for x in production])}])", |
|
None, |
|
None, |
|
) |
|
) |
|
continue |
|
new_stack: Stack = [*stack[:-1], *reversed(production)] |
|
derivation_expand: Derivation = (stack[-1], len(production)) |
|
history.append( |
|
( |
|
new_stack, |
|
input_buffer.copy(), |
|
f"expand({stack[-1]!r} $\\rightarrow$ {' '.join([f'{x!r}' for x in production])})", |
|
derivation_expand, |
|
None, |
|
) |
|
) |
|
new_history, accept = top_down_parser_backtrack( |
|
grammar_rules, |
|
input_buffer.copy(), |
|
new_stack, |
|
[], |
|
derivation_history=derivation_history + [derivation_expand], |
|
look_ahead=look_ahead, |
|
stop_at_first_solution=stop_at_first_solution, |
|
) |
|
history.extend(new_history) |
|
if accept and stop_at_first_solution: |
|
return history, True |
|
# Backtrack (if no match or expand was found) |
|
history.append((stack.copy(), input_buffer.copy(), f"backtrack", None, None)) |
|
return history, False |
|
|
|
|
|
def format_history(history: History) -> str: |
|
history_df: Dict[str, List[str]] = {"stack": [], "input": [], "action": []} |
|
for stack, input_buffer, action, derivation, _derivation_history in history: |
|
history_df["stack"].append(" ".join(["$", *stack])) |
|
history_df["input"].append(" ".join([*input_buffer, "$"])) |
|
history_df["action"].append( |
|
action + ("" if derivation is None else f" $\\Rightarrow$ {derivation}") |
|
) |
|
|
|
df = pd.DataFrame(data=history_df) |
|
md2anki_anki = "MD2ANKI_ANKI_HTML" |
|
if md2anki_anki in os.environ and os.environ[md2anki_anki] == str(True): |
|
# Render to HTML for anki notes because otherwise the table looks bad |
|
return df.to_html() |
|
return df.to_markdown(colalign=("right", "left", "right", "left")) |
|
|
|
|
|
def get_parse_trees(history: History) -> List[DerivationTreeNode]: |
|
parse_trees: List[DerivationTreeNode] = [] |
|
for _stack, _input_buffer, action, derivation, derivation_history in history: |
|
if action.startswith("accept") and derivation_history is not None: |
|
parse_tree, _ = build_tree(derivation_history) |
|
if parse_tree is not None: |
|
parse_trees.append(parse_tree) |
|
return parse_trees |
|
|
|
|
|
for [word_str, grammar_str, use_look_ahead] in data: |
|
grammar = get_grammar_rules_from_string(grammar_str) |
|
look_ahead_info = ( |
|
(calculate_first_sets(grammar), calculate_follow_set(grammar)) |
|
if use_look_ahead |
|
else None |
|
) |
|
td_history, _ = top_down_parser_backtrack( |
|
grammar, list(word_str), look_ahead=look_ahead_info, stop_at_first_solution=True |
|
) |
|
grammar_md_list = "".join( |
|
map( |
|
lambda x: f"\n- {x}", |
|
grammar_str.replace("→", "$\\rightarrow$").splitlines(), |
|
) |
|
) |
|
print( |
|
f"**Top-Down Parser(word={word_str!r},look-ahead={use_look_ahead})**:\n\nGrammar:\n{grammar_md_list}\n" |
|
) |
|
if look_ahead_info is not None: |
|
print( |
|
"FIRST sets:\n" |
|
+ "".join( |
|
map( |
|
lambda x: f"\n- FIRST({x[0]}) = " |
|
+ "$\\{$ " |
|
+ ", ".join(sorted(x[1])) |
|
+ " $\\}$", |
|
look_ahead_info[0].items(), |
|
) |
|
) |
|
) |
|
print( |
|
"\nFOLLOW sets (take with a grain of salt - not 100% checked!):\n" |
|
+ "".join( |
|
map( |
|
lambda x: f"\n- FOLLOW({x[0]}) = " |
|
+ "$\\{$ " |
|
+ ", ".join(sorted(x[1])) |
|
+ " $\\}$", |
|
look_ahead_info[1].items(), |
|
) |
|
) |
|
) |
|
print(f"\n{format_history(td_history)}\n") |
|
print( |
|
"Parse trees:\n" |
|
+ "".join(map(lambda x: f"\n- {repr(x)}", get_parse_trees(td_history))) |
|
+ "\n\n" |
|
) |