Skip to content

Instantly share code, notes, and snippets.

@phantamanta44
Created January 31, 2022 01:24
Show Gist options
  • Save phantamanta44/8b5530744476d5be7c5edc8b66380113 to your computer and use it in GitHub Desktop.
Save phantamanta44/8b5530744476d5be7c5edc8b66380113 to your computer and use it in GitHub Desktop.
wynnatlas expression parser generator
#!/usr/bin/env python3
from abc import ABC
from collections import defaultdict
import itertools
import sys
from typing import Any, Iterable, Iterator, Optional, cast
class Word(ABC):
pass
class WordNonterm(Word):
def __init__(self, nonterm_name: str):
self.nonterm_name = nonterm_name
def __eq__(self, o: Any) -> bool:
return isinstance(o, WordNonterm) and self.nonterm_name == o.nonterm_name
def __hash__(self) -> int:
return hash(self.nonterm_name) * 31
def __repr__(self) -> str:
return self.nonterm_name
class WordTerm(Word):
def __init__(self, term_name: str):
self.term_name = term_name
self.str_rep = term_name if term_name.isidentifier() else f'"{term_name}"'
def __eq__(self, o: Any) -> bool:
return isinstance(o, WordTerm) and self.term_name == o.term_name
def __hash__(self) -> int:
return hash(self.term_name) * 13
def __repr__(self) -> str:
return self.str_rep
class WordEps(WordTerm):
def __init__(self):
super(WordEps, self).__init__('')
def __eq__(self, o: Any) -> bool:
return isinstance(o, WordEps)
def __hash__(self) -> int:
return 0xDEADBEEF
def __repr__(self) -> str:
return 'ε'
EPS = WordEps()
def main():
gram_lines: list[str]
with open(sys.argv[1], 'r', encoding='UTF-8') as gram_file:
gram_lines = gram_file.readlines()
nonterm_strs: list[str] = []
nonterm_buf: list[str] = []
def flush_buf():
nonlocal nonterm_buf
nonterm_str = '\n'.join(nonterm_buf)
if len(nonterm_str) > 0:
nonterm_strs.append(nonterm_str)
nonterm_buf = []
for line in gram_lines:
line = line.strip()
if len(line) > 0:
nonterm_buf.append(line)
else:
flush_buf()
flush_buf()
nonterms: dict[str, list[list[Word]]] = {}
terms: set[WordTerm] = {EPS}
for nonterm_str in nonterm_strs:
nonterm_part_strs = nonterm_str.split(':=', maxsplit=1)
if len(nonterm_part_strs) != 2:
continue
prods: list[list[Word]] = []
for prod_str in nonterm_part_strs[1].split('| '):
prod: list[Word] = []
for term_str in prod_str.split():
if term_str == 'ε':
prod.append(EPS)
elif term_str.startswith('\"'):
word = WordTerm(term_str[1:-1])
prod.append(word)
terms.add(word)
else:
prod.append(WordNonterm(term_str))
prods.append(prod)
nonterms[nonterm_part_strs[0].strip()] = prods
print('### PRODUCTIONS ###')
for nonterm_name, prods in nonterms.items():
print(f'{nonterm_name} := {" ".join(repr(word) for word in prods[0])}')
indent = ' ' * (len(nonterm_name) + 2)
for prod in prods[1:]:
print(f'{indent}| {" ".join(repr(word) for word in prod)}')
print()
first_tbl: defaultdict[Word, set[WordTerm]] = defaultdict(set)
for term in terms:
first_tbl[term] = {term}
while True:
changed = False
for nonterm_name, prods in nonterms.items():
nonterm_word = WordNonterm(nonterm_name)
first_set = first_tbl[nonterm_word]
pre_len = len(first_set)
for prod in prods:
for prod_word in prod:
seen_eps = False
for fst_word in first_tbl[prod_word]:
if fst_word == EPS:
seen_eps = True
else:
first_set.add(fst_word)
if not seen_eps:
break
else:
first_set.add(EPS)
if len(first_set) != pre_len:
changed = True
if not changed:
break
print('### FIRST ###')
just_len = max(len(nonterm_name) for nonterm_name in nonterms.keys()) + 2
for nonterm_name in nonterms.keys():
print(f'{f"`{nonterm_name}`".ljust(just_len)} | '\
f'{", ".join(sorted(f"`{repr(word)}`" for word in first_tbl[WordNonterm(nonterm_name)]))}')
print()
def get_seq_fst(word_seq: Iterable[Word]) -> set[WordTerm]:
first_set: set[WordTerm] = set()
for word in word_seq:
seen_eps = False
for fst_word in first_tbl[word]:
if fst_word == EPS:
seen_eps = True
else:
first_set.add(fst_word)
if not seen_eps:
break
else:
first_set.add(EPS)
return first_set
follow_tbl: defaultdict[str, set[WordTerm]] = defaultdict(set)
while True:
changed = False
for nonterm_name, prods in nonterms.items():
for prod in prods:
for prod_word_ndx in range(len(prod)):
prod_word = prod[prod_word_ndx]
if isinstance(prod_word, WordNonterm):
follow_set = follow_tbl[prod_word.nonterm_name]
pre_len = len(follow_set)
seen_eps = False
following_fst = get_seq_fst(prod[prod_word_ndx + 1:])
for fst_word in following_fst:
if fst_word == EPS:
seen_eps = True
else:
follow_set.add(fst_word)
if seen_eps or len(following_fst) == 0:
for fol_word in follow_tbl[nonterm_name]:
follow_set.add(fol_word)
if len(follow_set) != pre_len:
changed = True
if not changed:
break
print('### FOLLOW ###')
for nonterm_name in nonterms.keys():
print(f'{f"`{nonterm_name}`".ljust(just_len)} | '\
f'{", ".join(sorted(f"`{repr(word)}`" for word in follow_tbl[nonterm_name]))}')
print()
parse_tbl: defaultdict[tuple[str, WordTerm], Optional[int]] = defaultdict(lambda: None)
def put_parse_tbl_entry(nonterm_name: str, term: WordTerm, prod_ndx: int):
if parse_tbl[(nonterm_name, term)] is not None:
raise ValueError(f'Parse table conflict: {nonterm_name} x {term} -> {parse_tbl[(nonterm_name, term)]} AND {prod_ndx}')
else:
parse_tbl[(nonterm_name, term)] = prod_ndx
for nonterm_name, prods in nonterms.items():
for prod_ndx in range(len(prods)):
prod_fst = get_seq_fst(prods[prod_ndx])
for term in prod_fst:
put_parse_tbl_entry(nonterm_name, term, prod_ndx)
if EPS in prod_fst:
for term in follow_tbl[nonterm_name]:
put_parse_tbl_entry(nonterm_name, term, prod_ndx)
term_list = sorted(terms, key=lambda t: repr(t))
print('### Parse Table ###')
print(f'Nonterminal | {" | ".join(repr(term) for term in term_list)}')
print(f'----------- | {" | ".join("-" * len(repr(term)) for term in term_list)}')
for nonterm_name in nonterms.keys():
row: list[str] = []
for term in term_list:
prod_ndx = parse_tbl[(nonterm_name, term)]
row.append((str(prod_ndx) if prod_ndx is not None else '-').ljust(len(repr(term))))
print(f'{nonterm_name.ljust(11)} | {" | ".join(row)}')
print()
san_nonterm_name = { nonterm_name: (nonterm_name[0].upper() + nonterm_name[1:]).replace('\'', '0') for nonterm_name in nonterms.keys() }
for nonterm_name, prods in nonterms.items():
safe_nonterm_name = nonterm_name.replace('\'', '\\\'')
print(f'function take{san_nonterm_name[nonterm_name]}(tokens) {{')
print('const children = [];')
print('switch (tokens.peek.type) {')
term_prods = cast(Iterator[tuple[WordTerm, int]],\
filter(lambda n: n[1] is not None, ((term, parse_tbl[(nonterm_name, term)]) for term in term_list)))
for prod_ndx, prod_terms in itertools.groupby(sorted(term_prods, key=lambda n: n[1]), lambda n: n[1]):
for term, _ in prod_terms:
print(f'case \'{"eof" if term == EPS else term.term_name}\':')
for word in prods[cast(int, prod_ndx)]:
if isinstance(word, WordNonterm):
print(f'children.push(take{san_nonterm_name[word.nonterm_name]}(tokens));')
elif isinstance(word, WordTerm) and not isinstance(word, WordEps):
print(f'children.push(tokens.consume(\'{word.term_name}\'));')
print(f'return {{ type: \'nonterm\', name: \'{safe_nonterm_name}\', prod: {prod_ndx}, children }};')
print('default:')
print(f'throw new Error(\'Could not parse {safe_nonterm_name}!\');')
print('}')
print('}')
if __name__ == '__main__':
main()
expr := conj
exprList := expr exprList'
exprList' := "," expr exprList'
| ε
conj := disj conj'
conj' := "&" disj conj'
| ε
disj := cmpEq disj'
disj' := "|" cmpEq disj'
| ε
cmpEq := cmpRel cmpEq'
cmpEq' := "=" cmpRel cmpEq'
| "!=" cmpRel cmpEq'
| "?=" cmpRel cmpEq'
| ε
cmpRel := sum cmpRel'
cmpRel' := "<=" sum cmpRel'
| "<" sum cmpRel'
| ">" sum cmpRel'
| ">=" sum cmpRel'
| ε
sum := prod sum'
sum' := "+" prod sum'
| "-" prod sum'
| ε
prod := exp prod'
prod' := "*" exp prod'
| "/" exp prod'
| ε
exp := unary exp'
exp' := "^" unary exp'
| ε
unary := "-" unary
| "!" unary
| prim
prim := "nLit"
| "bLit"
| "sLit"
| "ident" identTail
| "(" expr ")"
identTail := "(" args ")"
| ε
args := exprList
| ε
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment