Skip to content

Instantly share code, notes, and snippets.

@wataash
Last active August 10, 2021 02:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wataash/dc9f8352dc147b223382def86c3a592b to your computer and use it in GitHub Desktop.
Save wataash/dc9f8352dc147b223382def86c3a592b to your computer and use it in GitHub Desktop.
A Python implementation of "Packrat Parsers Can Support Left Recursion" by Alessandro Warth, James R. Douglass, and Todd Millstein. http://www.vpri.org/pdf/tr2007002_packrat.pdf
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
2. An Overview of Packrat Parsing
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function == 'EVAL']) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=True)
class MemoEntry:
"""MEMOENTRY"""
ans: AST
pos: POS
def __repr__(self):
return f'M[{self.pos} {self.ans}]'
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 1.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m = MemoEntry(ans, Pos)
MEMO_set(R, P, m) # MEMO(R, P) <- m
print_memo_if_changed(R, P, indent)
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_expr() -> AST:
"""
expr ::= <num> "+" <num>
/ <num> "-" <num>
"""
def rule1() -> AST:
"""<num> "+" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_plus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<num> "-" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('1234-5')
Pos = POS(0) # Pos
pos_orig = Pos
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
# AST(string='expr', children=[AST(string='1234', children=[]), AST(string='-', children=[]), AST(string='5', children=[])])
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3. Adding Support for Left Recursion
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function == 'EVAL']) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=True)
class MemoEntry:
"""MEMOENTRY"""
ans: AST
pos: POS
def __repr__(self):
return f'M[{self.pos} {self.ans}]'
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 1.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m = MemoEntry(ans, Pos)
MEMO_set(R, P, m) # MEMO(R, P) <- m
print_memo_if_changed(R, P, indent)
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_expr() -> AST:
"""
expr ::= <expr> "-" <num> / <num>
"""
def rule1() -> AST:
"""<expr> "-" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('1-2-3')
Pos = POS(0) # Pos
pos_orig = Pos
# > The parser is doomed to repeat the same steps forever, or more
# > precisely, until the computer eventually runs out of the stack space.
# RecursionError
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3.1 Avoiding Infinite Recursion in Left-Recursive Rules
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function == 'EVAL']) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=False)
class MemoEntry:
"""MEMOENTRY"""
ans: AST
pos: POS
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 2.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
m = MemoEntry(FAIL, P)
MEMO_set(R, P, m)
print_memo_if_changed(R, P, indent)
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m.ans = ans
m.pos = Pos
print_memo_if_changed(R, P, indent)
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_expr() -> AST:
"""
expr ::= <expr> "-" <num> / <num>
"""
def rule1() -> AST:
"""<expr> "-" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('1-2-3')
Pos = POS(0) # Pos
pos_orig = Pos
# > The parser will then move on to the second choice, <num>, which will
# > succeed after consuming the "1", and leave the rest of the input,
# > "-2-3", unprocessed.
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3.2 Supporting Direct Left Recursion
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function in ['EVAL', 'GROW_LR']]) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=False)
class LR:
"""LR"""
detected: bool
@dataclasses.dataclass(frozen=False)
class MemoEntry:
"""MEMOENTRY"""
ans: t.Union[AST, LR]
pos: POS
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 4.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
lr = LR(False)
m = MemoEntry(lr, P)
MEMO_set(R, P, m)
print_memo_if_changed(R, P, indent)
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m.ans = ans
m.pos = Pos
print_memo_if_changed(R, P, indent)
if lr.detected and ans is not FAIL:
print(f'\x1b[34m{indent} GROW-LR\x1b[0m')
return GROW_LR(R, P, m, None)
else:
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
if isinstance(m.ans, LR):
m.ans.detected = True
print_memo_if_changed(R, P, indent)
return FAIL
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# noinspection PyPep8Naming
def GROW_LR(R: 'RULE', P: POS, M: MemoEntry, H: 'unknown') -> AST:
"""
Figure 3.
GROW-LR
"""
global Pos
# ... # line A
while True:
Pos = P
# ... # line B
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
if ans is FAIL or Pos <= M.pos:
break
M.ans = ans
M.pos = Pos
# ... # line C
Pos = M.pos
return M.ans
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_expr() -> AST:
"""
expr ::= <expr> "-" <num> / <num>
"""
def rule1() -> AST:
"""<expr> "-" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('1-2-3')
Pos = POS(0) # Pos
pos_orig = Pos
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3.2 Supporting Direct Left Recursion
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function in ['EVAL', 'GROW_LR']]) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=False)
class LR:
"""LR"""
detected: bool
@dataclasses.dataclass(frozen=False)
class MemoEntry:
"""MEMOENTRY"""
ans: t.Union[AST, LR]
pos: POS
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 4.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
lr = LR(False)
m = MemoEntry(lr, P)
MEMO_set(R, P, m)
print_memo_if_changed(R, P, indent)
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m.ans = ans
m.pos = Pos
print_memo_if_changed(R, P, indent)
if lr.detected and ans is not FAIL:
print(f'\x1b[34m{indent} GROW-LR\x1b[0m')
return GROW_LR(R, P, m, None)
else:
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
if isinstance(m.ans, LR):
m.ans.detected = True
print_memo_if_changed(R, P, indent)
return FAIL
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# noinspection PyPep8Naming
def GROW_LR(R: 'RULE', P: POS, M: MemoEntry, H: 'unknown') -> AST:
"""
Figure 3.
GROW-LR
"""
global Pos
# ... # line A
while True:
Pos = P
# ... # line B
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
if ans is FAIL or Pos <= M.pos:
break
M.ans = ans
M.pos = Pos
# ... # line C
Pos = M.pos
return M.ans
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_term() -> AST:
"""
term ::= <term> "+" <fact>
/ <term> "-" <fact>
/ <fact>
"""
def rule1() -> AST:
"""<term> "+" <fact>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_term, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_plus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_fact, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('term', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<term> "-" <fact>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_term, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_fact, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('term', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule3() -> AST:
"""<num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_fact, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('term', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
ast = rule3()
if ast != FAIL:
return ast
return FAIL
def rule_fact() -> AST:
"""
fact ::= <fact> "*" <num>
/ <fact> "/" <num>
/ <num>
"""
def rule1() -> AST:
"""<fact> "*" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_fact, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_mul, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('fact', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<fact> "/" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_fact, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_div, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('fact', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule3() -> AST:
"""<num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('fact', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
ast = rule3()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('<num>', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
def rule_mul() -> AST:
"""
"*"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '*':
return FAIL
return AST('"*"', '*', len('*'), [], n_stack())
def rule_div() -> AST:
"""
"/"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '/':
return FAIL
return AST('"/"', '/', len('/'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('1-2/3')
Pos = POS(0) # Pos
pos_orig = Pos
ast = APPLY_RULE_or_rollback_Pos(rule_term, Pos, pos_orig)
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3.3 Getting Ready For Indirect Left Recursion
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function in ['EVAL', 'GROW_LR']]) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=False)
class LR:
"""LR"""
detected: bool
@dataclasses.dataclass(frozen=False)
class MemoEntry:
"""MEMOENTRY"""
ans: t.Union[AST, LR]
pos: POS
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
global memo_
"""MEMO :(RULE, POS) -> MEMOENTRY"""
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
global memo_
"""MEMO(R, P) <- m"""
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 4.
APPLY-RULE(R, P)
"""
global Pos
indent = ' ' * n_stack()
m = MEMO(R, P)
if m is None:
lr = LR(False)
m = MemoEntry(lr, P)
MEMO_set(R, P, m)
print_memo_if_changed(R, P, indent)
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
m.ans = ans
m.pos = Pos
print_memo_if_changed(R, P, indent)
if lr.detected and ans is not FAIL:
print(f'\x1b[34m{indent} GROW-LR\x1b[0m')
return GROW_LR(R, P, m, None)
else:
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
if isinstance(m.ans, LR):
m.ans.detected = True
print_memo_if_changed(R, P, indent)
return FAIL
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# noinspection PyPep8Naming
def GROW_LR(R: 'RULE', P: POS, M: MemoEntry, H: 'unknown') -> AST:
"""
Figure 3.
GROW-LR
"""
global Pos
# ... # line A
while True:
Pos = P
# ... # line B
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
if ans is FAIL or Pos <= M.pos:
break
M.ans = ans
M.pos = Pos
# ... # line C
Pos = M.pos
return M.ans
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_x() -> AST:
"""
x ::= <expr>
"""
global BODY, Pos
pos_orig = Pos
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
if ast == FAIL:
return ast
return AST('x', BODY[pos_orig:Pos], 0, [ast], n_stack())
def rule_expr() -> AST:
"""
expr ::= <x> "-" <num> / <num>
"""
def rule1() -> AST:
"""<fact> "*" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_x, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<fact> "/" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
def rule_mul() -> AST:
"""
"*"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '*':
return FAIL
return AST('"*"', '*', len('*'), [], n_stack())
def rule_div() -> AST:
"""
"/"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '/':
return FAIL
return AST('"/"', '/', len('/'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('4-3')
Pos = POS(0) # Pos
pos_orig = Pos
# > This is clearly not the behavior we wanted.
ast = APPLY_RULE_or_rollback_Pos(rule_x, Pos, pos_orig)
print()
print(ast)
# SPDX-License-Identifier: Apache-2.0
"""
https://gist.github.com/wataash/dc9f8352dc147b223382def86c3a592b
A Python implementation of "Packrat Parsers Can Support Left Recursion" by
Alessandro Warth, James R. Douglass, and Todd Millstein.
http://www.vpri.org/pdf/tr2007002_packrat.pdf
3.4 Adding Support for Left Recursion
"""
import collections
import dataclasses
import inspect
import re
import typing as t
def n_stack() -> int:
return len([None for x in inspect.stack() if x.function in ['EVAL', 'GROW_LR']]) - 1
"""
[R.body]:
In the APPLY-RULE() function in the paper, the EVAL() function is invoked as:
EVAL(R.body)
where R is RULE. But I have no idea what the R.body (RULE's body) is --
instead, we define the global variable "BODY", and invoke EVAL() function as:
EVAL(R)
"""
Body = t.NewType('Body', str) # not defined in the paper
# BODY = Body('1234-5')
POS = t.NewType('POS', int) # POS
# Pos = POS(0) # Pos
@dataclasses.dataclass(frozen=True)
class AST:
"""
AST
The data structure is not defined in the paper.
"""
type: str
string: str
leaf_len: int
children: list['AST']
n_stack: int
def __repr__(self):
color = ['\x1b[31m', '\x1b[32m', '\x1b[33m'][self.n_stack % 3]
if self.children:
children = " ".join(str(x) for x in self.children)
return f'{color}<\x1b[0m{self.type}:{self.string} {children}{color}>\x1b[0m'
return f'{color}<\x1b[0m{self.type}:{self.string}{color}>\x1b[0m'
FAIL = AST('FAIL', '', 0, [], 0) # FAIL
@dataclasses.dataclass(frozen=False)
class HEAD:
"""HEAD"""
rule: 'RULE'
involvedSet: set['RULE']
evalSet: set['RULE']
def __repr__(self):
return f'HEAD[{self.rule.__name__} {{{" ".join([x.__name__ for x in self.involvedSet])}}} {{{" ".join([x.__name__ for x in self.evalSet])}}}]'
@dataclasses.dataclass(frozen=False)
class LR:
"""LR"""
seed: AST
rule: 'RULE'
head: t.Optional[HEAD]
next: 'LR'
def __repr__(self):
l = self.next
n_next = 0 # excluding 'unknown_stack_bottom_value'
while l != 'unknown_stack_bottom_value':
l = l.next
n_next += 1
return f'\x1b[35mLR[\x1b[0m{self.seed} {self.rule.__name__} {self.head} {n_next}\x1b[35m]\x1b[0m'
@dataclasses.dataclass(frozen=False)
class MemoEntry:
"""MEMOENTRY"""
ans: t.Union[AST, LR]
pos: POS
def __repr__(self):
return f'M[{self.pos} {self.ans}]'
# keep order for better debug-print()
memo_: collections.OrderedDict[tuple['RULE', POS], MemoEntry] = collections.OrderedDict()
# noinspection PyPep8Naming
def MEMO(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
"""MEMO :(RULE, POS) -> MEMOENTRY"""
global memo_
return memo_.get((R, P))
# noinspection PyPep8Naming
def MEMO_set(R: 'RULE', P: POS, m: MemoEntry):
"""MEMO(R, P) <- m"""
global memo_
assert (R, P) not in memo_
memo_[R, P] = m
memo_str_last: dict[tuple['RULE', POS], str] = {}
def print_memo_if_changed(R: 'RULE', P: POS, indent: str):
global memo_str_last
s = f'{indent} MEMO({R.__name__}, {P}) <- {MEMO(R, P)}'
if s == memo_str_last.get((R, P)):
return
print(s)
memo_str_last[R, P] = s
# def EVAL(body: Body) -> AST: # [R.body]
# noinspection PyPep8Naming
def EVAL(R: 'RULE') -> AST:
"""
EVAL
"""
global BODY, Pos
pos_orig = Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: evaluate\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
ast = R()
if ast is FAIL:
assert Pos == pos_orig
print(f'\x1b[34m{indent}{R.__name__}: failed\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
return FAIL
pos2 = Pos + ast.leaf_len
print(f'\x1b[34m{indent}{R.__name__}: matched: {BODY[pos_orig:pos2]}\x1b[0m')
if Pos != pos2:
print(f'{indent} Pos:{Pos}->{pos2} BODY:{BODY[Pos:pos2]}>{BODY[pos2:]}')
print(f'{indent} ast:{ast}')
Pos = pos2
return ast
HEADS: dict[POS, t.Optional[HEAD]] = {} # HEADS
# noinspection PyPep8Naming
def RECALL(R: 'RULE', P: POS) -> t.Optional[MemoEntry]:
"""
Figure 9.
RECALL(R, P)
"""
indent = ' ' * n_stack()
m = MEMO(R, P)
h = HEADS.get(P)
print(f'{indent} RECALL: m:{m} h:{h}')
# If not growing a sed parse, just return what is stored
# in the memo table.
if h is None:
return m
# Do not evaluate any rule that is not involved in this
# left recursion.
# if m is None and R not in {h.head} | h.involvedSet # typo?
if m is None and R not in {h.rule} | h.involvedSet:
return MemoEntry(FAIL, P)
# Allow involved rules to be evaluated, but only once,
# during a seed-growing iteration.
if R in h.evalSet:
h.evalSet = h.evalSet - {R}
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R)
m.ans = ans
m.pos = Pos
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
return m
# noinspection PyPep8Naming
def LR_ANSWER(R: 'RULE', P: POS, M: MemoEntry) -> AST:
"""
Figure 8.
LR-ANSWER(R, P, M)
"""
indent = ' ' * n_stack()
h = M.ans.head
if h.rule is not R:
return M.ans.seed
else:
M.ans = M.ans.seed
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
if M.ans is FAIL:
return FAIL
else:
return GROW_LR(R, P, M, h)
LRStack: LR = 'unknown_stack_bottom_value' # LRStack
LRStack_str_last = ''
def print_lr_stack_if_changed(indent: str):
global LRStack, LRStack_str_last
if LRStack == 'unknown_stack_bottom_value':
return
s = ''
s += f'{indent} LRStack:{LRStack}\n'
l = LRStack.next
while l != 'unknown_stack_bottom_value':
s+= f'{indent} {l}\n'
l = l.next
if s != LRStack_str_last:
print(s, end='')
LRStack_str_last = s
# noinspection PyPep8Naming
def SETUP_LR(R: 'RULE', L: LR):
"""
Figure 7.
SETUP-LR(R, L)
"""
global LRStack
if L.head is None:
L.head = HEAD(R, set(), set())
s = LRStack
# while s.head != L.head:
while s != 'unknown_stack_bottom_value' and s.head != L.head:
s.head = L.head
L.head.involvedSet = L.head.involvedSet | {s.rule}
print_lr_stack_if_changed(' ' * n_stack())
# noinspection PyPep8Naming
def APPLY_RULE(R: 'RULE', P: POS) -> AST:
"""
Figure 6.
APPLY-RULE(R, P)
"""
global LRStack, Pos
indent = ' ' * n_stack()
print(f'\x1b[34m{indent}{R.__name__}: APPLY-RULE\x1b[0m')
print(f'{indent} {Pos} {BODY[Pos:]}')
m = RECALL(R, P)
if m is None:
# Create a new LR and push it onto the rule
# invocation stack.
lr = LR(FAIL, R, None, LRStack)
LRStack = lr
# Memoize lr, then evaluate R.
m = MemoEntry(lr, P)
MEMO_set(R, P, m)
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
# Pop lr off the rule invocation stack.
LRStack = LRStack.next
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
m.pos = Pos
if lr.head is not None:
lr.seed = ans
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
return LR_ANSWER(R, P, m)
else:
m.ans = ans
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
return ans
else:
print(f'\x1b[34m{indent}{R.__name__}: using memo\x1b[0m')
print(f'{indent} Pos:{Pos}->{m.pos} BODY:{BODY[Pos:m.pos]}>{BODY[m.pos:]}')
print(f'{indent} m:{m}')
Pos = m.pos
if isinstance(m.ans, LR):
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
print(f'\x1b[34m{indent} SETUP-LR\x1b[0m')
SETUP_LR(R, m.ans)
print_memo_if_changed(R, P, indent)
print_lr_stack_if_changed(indent)
return m.ans.seed
else:
return m.ans
# noinspection PyPep8Naming
def APPLY_RULE_or_rollback_Pos(R: 'RULE', P: POS, pos_rollback_to: POS) -> AST:
global Pos
ast = APPLY_RULE(R, P)
if ast is FAIL:
Pos = pos_rollback_to
return ast
# noinspection PyPep8Naming
def GROW_LR(R: 'RULE', P: POS, M: MemoEntry, H: HEAD) -> AST:
"""
Figure 3.
GROW-LR
"""
global Pos
HEADS[P] = H # line A
while True:
Pos = P
H.evalSet = H.involvedSet.copy() # line B
# ans = EVAL(R.body) # [R.body]
ans = EVAL(R) # [R.body]
if ans is FAIL or Pos <= M.pos:
break
M.ans = ans
M.pos = Pos
HEADS[P] = None # line C
Pos = M.pos
return M.ans
# -----------------------------------------------------------------------------
# RULE
# The type is defined in the paper
RULE = t.Callable[[], AST]
rule_expr: RULE
rule_num: RULE
rule_plus: RULE
rule_minus: RULE
def rule_x() -> AST:
"""
x ::= <expr>
"""
global BODY, Pos
pos_orig = Pos
ast = APPLY_RULE_or_rollback_Pos(rule_expr, Pos, pos_orig)
if ast == FAIL:
return ast
return AST('x', BODY[pos_orig:Pos], 0, [ast], n_stack())
def rule_expr() -> AST:
"""
expr ::= <x> "-" <num> / <num>
"""
def rule1() -> AST:
"""<fact> "*" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_x, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
ast1 = APPLY_RULE_or_rollback_Pos(rule_minus, Pos, pos_orig)
if ast1 == FAIL:
return FAIL
ast2 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast2 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0, ast1, ast2], n_stack())
def rule2() -> AST:
"""<fact> "/" <num>"""
global BODY, Pos
pos_orig = Pos
ast0 = APPLY_RULE_or_rollback_Pos(rule_num, Pos, pos_orig)
if ast0 == FAIL:
return FAIL
return AST('expr', BODY[pos_orig:Pos], 0, [ast0], n_stack())
global BODY, Pos
ast = rule1()
if ast != FAIL:
return ast
ast = rule2()
if ast != FAIL:
return ast
return FAIL
def rule_num() -> AST:
"""
<num>
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
match = re.search(r'^(\d+)', BODY[Pos:])
if match is None:
return FAIL
return AST('num', match[1], len(match[1]), [], n_stack())
def rule_plus() -> AST:
"""
"+"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '+':
return FAIL
return AST('"+"', '+', len('+'), [], n_stack())
def rule_minus() -> AST:
"""
"-"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '-':
return FAIL
return AST('"-"', '-', len('-'), [], n_stack())
def rule_mul() -> AST:
"""
"*"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '*':
return FAIL
return AST('"*"', '*', len('*'), [], n_stack())
def rule_div() -> AST:
"""
"/"
"""
global BODY, Pos
if Pos >= len(BODY):
return FAIL
if BODY[Pos] != '/':
return FAIL
return AST('"/"', '/', len('/'), [], n_stack())
# -----------------------------------------------------------------------------
# main
if __name__ == '__main__':
BODY = Body('4-3')
Pos = POS(0) # Pos
pos_orig = Pos
# > This is clearly not the behavior we wanted.
ast = APPLY_RULE_or_rollback_Pos(rule_x, Pos, pos_orig)
print()
print(ast)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment