Skip to content

Instantly share code, notes, and snippets.

@tmiasko
Created May 2, 2021 15:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tmiasko/7c31853276d94af57c18ca7522badd1c to your computer and use it in GitHub Desktop.
Save tmiasko/7c31853276d94af57c18ca7522badd1c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import string
import pprint
class Node:
def __init__(self, name, nodes):
self.name = name
self.nodes = nodes
def print(self, pp):
pp.open(self.name)
for node in self.nodes:
node.print(pp)
pp.close()
class TextNode:
def __init__(self, text):
self.text = text
def print(self, pp):
pp.text(repr(self.text))
class ErrorNode:
def __init__(self, message=None):
self.message = message
def print(self, pp):
pp.open('error')
if self.message is not None:
pp.text(self.message)
pp.close()
class PrettyPrinter:
def __init__(self):
self.s = ''
self.indent = 0
def text(self, s):
if not self.s.endswith(' '):
self.s += ' '
self.s += s
def open(self, name):
if self.s:
self.s += '\n' + self.indent * ' '
self.indent += 1
self.s += '(' + name
def close(self):
self.indent -= 1
self.s += ')'
def print(self, node):
self.s = ''
self.indent = 0
node.print(self)
print(self.s)
class Symbol:
def __init__(self, name):
self.name = name
self.productions = []
self.chars = set()
def initialize_chars(self):
return self.chars
def parse(self, p):
return p.parse_symbol(self)
def __repr__(self):
return f'<{self.name}>'
class Seq:
def __init__(self, symbols):
self.symbols = symbols
self.chars = set()
def initialize_chars(self):
result = set()
empty = True
for symbol in self.symbols:
chars = symbol.initialize_chars()
if empty:
result |= chars
empty = None in chars
if empty:
result.add(None)
else:
result.discard(None)
self.chars = result
return self.chars
def parse(self, p):
result = []
for symbol in self.symbols:
result.extend(symbol.parse(p))
return result
def __repr__(self):
return ' '.join(repr(s) for s in self.symbols)
class Repeat:
def __init__(self, rule):
self.rule = rule
self.chars = {None}
def initialize_chars(self):
self.chars |= self.rule.initialize_chars()
return self.chars
def parse(self, p):
result = []
while True:
c = p.peek()
if c is None or c not in self.rule.chars:
break
result.extend(self.rule.parse(p))
return result
def __repr__(self):
return f'{{{self.rule!r}}}'
class Opt:
def __init__(self, rule):
self.rule = rule
self.chars = {None}
def initialize_chars(self):
self.chars |= self.rule.initialize_chars()
return self.chars
def parse(self, p):
c = p.peek()
if c is None or c not in self.rule.chars:
return []
else:
return self.rule.parse(p)
def __repr__(self):
return f'[{self.rule!r}]'
class Terminal:
def __init__(self, name, chars):
self.name = name
self.chars = set(chars)
def initialize_chars(self):
return self.chars
def __repr__(self):
return self.name
class String(Terminal):
def __init__(self, s):
super().__init__(s, {s[0]})
def parse(self, p):
return [p.parse_string(self.name)]
class Base62Number(Terminal):
def __init__(self):
super().__init__('base-62-number', string.ascii_lowercase + string.ascii_uppercase + string.digits + '_')
def parse(self, p):
return [p.parse_base62_number()]
class DecimalNumber(Terminal):
def __init__(self):
super().__init__('decimal-number', string.digits)
def parse(self, p):
return [p.parse_decimal_number()]
class HexNumber(Terminal):
def __init__(self):
super().__init__('hex-number', string.hexdigits.lower() + 'n_')
def parse(self, p):
return [p.parse_hex_number()]
class UndisambiguatedIdentifier(Terminal):
def __init__(self):
super().__init__('undisambiguated-identifier', string.digits + 'u')
def parse(self, p):
return [p.parse_undisambiguated_identifier()]
class Grammar:
def __init__(self, start):
self.symbols = {}
self.start = self.symbol(start)
def symbol(self, name):
if name not in self.symbols:
self.symbols[name] = Symbol(name)
return self.symbols[name]
def rule(self, lhs, *rhs):
symbols = []
for s in rhs:
if isinstance(s, str):
s = String(s)
symbols.append(s)
lhs.productions.append(Seq(symbols))
def finalize(self):
changed = True
while changed:
changed = False
for s in self.symbols.values():
old_size = len(s.chars)
for rule in s.productions:
s.chars |= rule.initialize_chars()
new_size = len(s.chars)
changed |= old_size != new_size
return self
class Parser:
def __init__(self, input):
self.input = input
self.pos = 0
self.err = False
def peek(self):
if not self.err and self.pos < len(self.input):
return self.input[self.pos]
def next(self):
assert not self.err
c = self.input[self.pos]
self.pos += 1
return c
def parse_bytes(self, n):
if self.err:
return ErrorNode()
if self.pos + n <= len(self.input):
start = self.pos
self.pos += n
return TextNode(self.input[start:self.pos])
self.err = True
return ErrorNode(f'expected {n} bytes')
def parse_if(self, prefix):
if not self.err and self.input.startswith(prefix, self.pos):
self.pos += len(prefix)
def parse_string(self, prefix):
if self.err:
return ErrorNode()
if self.input.startswith(prefix, self.pos):
self.pos += len(prefix)
return TextNode(prefix)
self.err = True
return ErrorNode(f'expected {prefix!r}')
def parse_hex_number(self):
# ["n"] {<0-9a-f>} "_"
if self.err:
return ErrorNode()
start = self.pos
self.parse_if('n')
while True:
c = self.peek()
if c is None:
self.err = True
return ErrorNode('unexpected eof')
elif c == '_':
self.next()
break
elif '0' <= c <= '9' or 'a' <= c <= 'f':
self.next()
else:
self.err = True
return ErrorNode(f'unexpected {c}')
return TextNode(self.input[start:self.pos])
def parse_base62_number(self):
# {<0-9a-zA-Z} "_"
if self.err:
return ErrorNode()
start = self.pos
while True:
c = self.peek()
if c is None:
self.err = True
return ErrorNode('unexpected eof')
elif c == '_':
self.next()
break
elif '0' <= c <= '9' or 'a' <= c <= 'z' or 'A' <= c <= 'Z':
self.next()
else:
self.err = True
return ErrorNode(f'unexpected {c}')
return TextNode(self.input[start:self.pos])
def parse_decimal_number(self):
# {<0-9>} without insignificant leading zeros
if self.err:
return ErrorNode()
start = self.pos
c = self.peek()
if c == '0':
self.next()
return TextNode(self.input[start:self.pos])
while True:
c = self.peek()
if '0' <= c <= '9':
self.next()
else:
break
return TextNode(self.input[start:self.pos])
def parse_undisambiguated_identifier(self):
start = self.pos
self.parse_if('u')
n = self.parse_decimal_number()
if self.err:
return ErrorNode()
n = int(n.text)
self.parse_if('_')
n = self.parse_bytes(n)
if self.err:
return n
ident = self.input[start:self.pos]
'a-z' 'A-Z' '_'
for c in ident:
if '0' <= c <= '9' or 'a' <= c <= 'z' or 'A' <= c <= 'Z' or c == '_':
continue
self.err = True
return ErrorNode(f'invalid identifier {ident}')
return TextNode(ident)
def parse_symbol(self, symbol):
production = None
for p in symbol.productions:
if self.peek() in p.chars:
production = p
break
if production is None:
self.err = True
return [ErrorNode(f'<{symbol.name}>')]
else:
return [Node(symbol.name, production.parse(self))]
@staticmethod
def parse(symbol, input):
p = Parser(input)
result = p.parse_symbol(symbol)
remaining = p.input[p.pos:]
if remaining:
result.append(ErrorNode(f'unexpected {remaining!r}'))
return result
def __repr__(self):
parsed = self.input[:self.pos]
remaining = self.input[self.pos:]
return f'Parser({parsed!r} {remaining!r})'
def rust_symbol_grammar():
g = Grammar('symbol-name')
abi = g.symbol('abi')
backref = g.symbol('backref')
basic_type = g.symbol('basic-type')
binder = g.symbol('binder')
const = g.symbol('const')
const_data = g.symbol('const-data')
disambiguator = g.symbol('disambiguator')
dyn_bounds = g.symbol('dyn-bounds')
dyn_trait = g.symbol('dyn-trait')
dyn_trait_assoc_binding = g.symbol('dyn-trait-assoc-binding')
fn_sig = g.symbol('fn-sig')
generic_arg = g.symbol('generic-arg')
identifier = g.symbol('identifier')
impl_path = g.symbol('impl-path')
instantiating_crate = g.symbol('instantiating-crate')
lifetime = g.symbol('lifetime')
namespace = g.symbol('namespace')
path = g.symbol('path')
symbol_name = g.symbol('symbol-name')
type = g.symbol('type')
g.rule(symbol_name, '_R', Opt(DecimalNumber()), path, Opt(instantiating_crate))
g.rule(path, 'C', identifier)
g.rule(path, 'M', impl_path, type)
g.rule(path, 'X', impl_path, type, path)
g.rule(path, 'Y', type, path)
g.rule(path, 'N', namespace, path, identifier)
g.rule(path, 'I', path, Repeat(generic_arg), 'E')
g.rule(path, backref)
g.rule(impl_path, Opt(disambiguator), path)
g.rule(identifier, Opt(disambiguator), UndisambiguatedIdentifier())
g.rule(disambiguator, 's', Base62Number())
for c in string.ascii_lowercase + string.ascii_uppercase:
g.rule(namespace, c)
g.rule(generic_arg, lifetime)
g.rule(generic_arg, type)
g.rule(generic_arg, 'K', const)
g.rule(lifetime, 'L', Base62Number())
g.rule(binder, 'G', Base62Number())
g.rule(type, basic_type)
g.rule(type, path)
g.rule(type, 'A', type, const)
g.rule(type, 'S', type)
g.rule(type, 'T', Repeat(type), 'E')
g.rule(type, 'R', Opt(lifetime), type)
g.rule(type, 'Q', Opt(lifetime), type)
g.rule(type, 'P', type)
g.rule(type, 'O', type)
g.rule(type, 'F', fn_sig)
g.rule(type, 'D', dyn_bounds, lifetime)
g.rule(type, backref)
g.rule(basic_type, 'a')
g.rule(basic_type, 'b')
g.rule(basic_type, 'c')
g.rule(basic_type, 'd')
g.rule(basic_type, 'e')
g.rule(basic_type, 'f')
g.rule(basic_type, 'h')
g.rule(basic_type, 'i')
g.rule(basic_type, 'j')
g.rule(basic_type, 'l')
g.rule(basic_type, 'm')
g.rule(basic_type, 'n')
g.rule(basic_type, 'o')
g.rule(basic_type, 's')
g.rule(basic_type, 't')
g.rule(basic_type, 'u')
g.rule(basic_type, 'v')
g.rule(basic_type, 'x')
g.rule(basic_type, 'y')
g.rule(basic_type, 'z')
g.rule(basic_type, 'p')
g.rule(fn_sig, Opt(binder), Opt(String('U')), Opt(Seq([String('K'), abi])), Repeat(type), 'E', type)
g.rule(abi, 'C')
g.rule(abi, UndisambiguatedIdentifier())
g.rule(dyn_bounds, Opt(binder), Repeat(dyn_trait), 'E')
g.rule(dyn_trait, path, Repeat(dyn_trait_assoc_binding))
g.rule(dyn_trait_assoc_binding, 'p', UndisambiguatedIdentifier(), type)
g.rule(const, 'p')
g.rule(const, backref)
g.rule(const, type, const_data)
g.rule(const_data, HexNumber())
g.rule(backref, 'B', Base62Number())
g.rule(instantiating_crate, path)
return g.finalize()
def parse(grammar, word):
pp = PrettyPrinter()
for node in Parser.parse(grammar.start, word):
pp.print(node)
def main():
grammar = rust_symbol_grammar()
if len(sys.argv) > 1:
for word in sys.argv[1:]:
parse(grammar, word)
else:
for line in sys.stdin:
for word in line.split():
parse(grammar, word)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment