-
-
Save tmiasko/7c31853276d94af57c18ca7522badd1c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import string | |
import pprint | |
class Node: | |
def __init__(self, name, nodes): | |
self.name = name | |
self.nodes = nodes | |
def print(self, pp): | |
pp.open(self.name) | |
for node in self.nodes: | |
node.print(pp) | |
pp.close() | |
class TextNode: | |
def __init__(self, text): | |
self.text = text | |
def print(self, pp): | |
pp.text(repr(self.text)) | |
class ErrorNode: | |
def __init__(self, message=None): | |
self.message = message | |
def print(self, pp): | |
pp.open('error') | |
if self.message is not None: | |
pp.text(self.message) | |
pp.close() | |
class PrettyPrinter: | |
def __init__(self): | |
self.s = '' | |
self.indent = 0 | |
def text(self, s): | |
if not self.s.endswith(' '): | |
self.s += ' ' | |
self.s += s | |
def open(self, name): | |
if self.s: | |
self.s += '\n' + self.indent * ' ' | |
self.indent += 1 | |
self.s += '(' + name | |
def close(self): | |
self.indent -= 1 | |
self.s += ')' | |
def print(self, node): | |
self.s = '' | |
self.indent = 0 | |
node.print(self) | |
print(self.s) | |
class Symbol: | |
def __init__(self, name): | |
self.name = name | |
self.productions = [] | |
self.chars = set() | |
def initialize_chars(self): | |
return self.chars | |
def parse(self, p): | |
return p.parse_symbol(self) | |
def __repr__(self): | |
return f'<{self.name}>' | |
class Seq: | |
def __init__(self, symbols): | |
self.symbols = symbols | |
self.chars = set() | |
def initialize_chars(self): | |
result = set() | |
empty = True | |
for symbol in self.symbols: | |
chars = symbol.initialize_chars() | |
if empty: | |
result |= chars | |
empty = None in chars | |
if empty: | |
result.add(None) | |
else: | |
result.discard(None) | |
self.chars = result | |
return self.chars | |
def parse(self, p): | |
result = [] | |
for symbol in self.symbols: | |
result.extend(symbol.parse(p)) | |
return result | |
def __repr__(self): | |
return ' '.join(repr(s) for s in self.symbols) | |
class Repeat: | |
def __init__(self, rule): | |
self.rule = rule | |
self.chars = {None} | |
def initialize_chars(self): | |
self.chars |= self.rule.initialize_chars() | |
return self.chars | |
def parse(self, p): | |
result = [] | |
while True: | |
c = p.peek() | |
if c is None or c not in self.rule.chars: | |
break | |
result.extend(self.rule.parse(p)) | |
return result | |
def __repr__(self): | |
return f'{{{self.rule!r}}}' | |
class Opt: | |
def __init__(self, rule): | |
self.rule = rule | |
self.chars = {None} | |
def initialize_chars(self): | |
self.chars |= self.rule.initialize_chars() | |
return self.chars | |
def parse(self, p): | |
c = p.peek() | |
if c is None or c not in self.rule.chars: | |
return [] | |
else: | |
return self.rule.parse(p) | |
def __repr__(self): | |
return f'[{self.rule!r}]' | |
class Terminal: | |
def __init__(self, name, chars): | |
self.name = name | |
self.chars = set(chars) | |
def initialize_chars(self): | |
return self.chars | |
def __repr__(self): | |
return self.name | |
class String(Terminal): | |
def __init__(self, s): | |
super().__init__(s, {s[0]}) | |
def parse(self, p): | |
return [p.parse_string(self.name)] | |
class Base62Number(Terminal): | |
def __init__(self): | |
super().__init__('base-62-number', string.ascii_lowercase + string.ascii_uppercase + string.digits + '_') | |
def parse(self, p): | |
return [p.parse_base62_number()] | |
class DecimalNumber(Terminal): | |
def __init__(self): | |
super().__init__('decimal-number', string.digits) | |
def parse(self, p): | |
return [p.parse_decimal_number()] | |
class HexNumber(Terminal): | |
def __init__(self): | |
super().__init__('hex-number', string.hexdigits.lower() + 'n_') | |
def parse(self, p): | |
return [p.parse_hex_number()] | |
class UndisambiguatedIdentifier(Terminal): | |
def __init__(self): | |
super().__init__('undisambiguated-identifier', string.digits + 'u') | |
def parse(self, p): | |
return [p.parse_undisambiguated_identifier()] | |
class Grammar: | |
def __init__(self, start): | |
self.symbols = {} | |
self.start = self.symbol(start) | |
def symbol(self, name): | |
if name not in self.symbols: | |
self.symbols[name] = Symbol(name) | |
return self.symbols[name] | |
def rule(self, lhs, *rhs): | |
symbols = [] | |
for s in rhs: | |
if isinstance(s, str): | |
s = String(s) | |
symbols.append(s) | |
lhs.productions.append(Seq(symbols)) | |
def finalize(self): | |
changed = True | |
while changed: | |
changed = False | |
for s in self.symbols.values(): | |
old_size = len(s.chars) | |
for rule in s.productions: | |
s.chars |= rule.initialize_chars() | |
new_size = len(s.chars) | |
changed |= old_size != new_size | |
return self | |
class Parser: | |
def __init__(self, input): | |
self.input = input | |
self.pos = 0 | |
self.err = False | |
def peek(self): | |
if not self.err and self.pos < len(self.input): | |
return self.input[self.pos] | |
def next(self): | |
assert not self.err | |
c = self.input[self.pos] | |
self.pos += 1 | |
return c | |
def parse_bytes(self, n): | |
if self.err: | |
return ErrorNode() | |
if self.pos + n <= len(self.input): | |
start = self.pos | |
self.pos += n | |
return TextNode(self.input[start:self.pos]) | |
self.err = True | |
return ErrorNode(f'expected {n} bytes') | |
def parse_if(self, prefix): | |
if not self.err and self.input.startswith(prefix, self.pos): | |
self.pos += len(prefix) | |
def parse_string(self, prefix): | |
if self.err: | |
return ErrorNode() | |
if self.input.startswith(prefix, self.pos): | |
self.pos += len(prefix) | |
return TextNode(prefix) | |
self.err = True | |
return ErrorNode(f'expected {prefix!r}') | |
def parse_hex_number(self): | |
# ["n"] {<0-9a-f>} "_" | |
if self.err: | |
return ErrorNode() | |
start = self.pos | |
self.parse_if('n') | |
while True: | |
c = self.peek() | |
if c is None: | |
self.err = True | |
return ErrorNode('unexpected eof') | |
elif c == '_': | |
self.next() | |
break | |
elif '0' <= c <= '9' or 'a' <= c <= 'f': | |
self.next() | |
else: | |
self.err = True | |
return ErrorNode(f'unexpected {c}') | |
return TextNode(self.input[start:self.pos]) | |
def parse_base62_number(self): | |
# {<0-9a-zA-Z} "_" | |
if self.err: | |
return ErrorNode() | |
start = self.pos | |
while True: | |
c = self.peek() | |
if c is None: | |
self.err = True | |
return ErrorNode('unexpected eof') | |
elif c == '_': | |
self.next() | |
break | |
elif '0' <= c <= '9' or 'a' <= c <= 'z' or 'A' <= c <= 'Z': | |
self.next() | |
else: | |
self.err = True | |
return ErrorNode(f'unexpected {c}') | |
return TextNode(self.input[start:self.pos]) | |
def parse_decimal_number(self): | |
# {<0-9>} without insignificant leading zeros | |
if self.err: | |
return ErrorNode() | |
start = self.pos | |
c = self.peek() | |
if c == '0': | |
self.next() | |
return TextNode(self.input[start:self.pos]) | |
while True: | |
c = self.peek() | |
if '0' <= c <= '9': | |
self.next() | |
else: | |
break | |
return TextNode(self.input[start:self.pos]) | |
def parse_undisambiguated_identifier(self): | |
start = self.pos | |
self.parse_if('u') | |
n = self.parse_decimal_number() | |
if self.err: | |
return ErrorNode() | |
n = int(n.text) | |
self.parse_if('_') | |
n = self.parse_bytes(n) | |
if self.err: | |
return n | |
ident = self.input[start:self.pos] | |
'a-z' 'A-Z' '_' | |
for c in ident: | |
if '0' <= c <= '9' or 'a' <= c <= 'z' or 'A' <= c <= 'Z' or c == '_': | |
continue | |
self.err = True | |
return ErrorNode(f'invalid identifier {ident}') | |
return TextNode(ident) | |
def parse_symbol(self, symbol): | |
production = None | |
for p in symbol.productions: | |
if self.peek() in p.chars: | |
production = p | |
break | |
if production is None: | |
self.err = True | |
return [ErrorNode(f'<{symbol.name}>')] | |
else: | |
return [Node(symbol.name, production.parse(self))] | |
@staticmethod | |
def parse(symbol, input): | |
p = Parser(input) | |
result = p.parse_symbol(symbol) | |
remaining = p.input[p.pos:] | |
if remaining: | |
result.append(ErrorNode(f'unexpected {remaining!r}')) | |
return result | |
def __repr__(self): | |
parsed = self.input[:self.pos] | |
remaining = self.input[self.pos:] | |
return f'Parser({parsed!r} {remaining!r})' | |
def rust_symbol_grammar(): | |
g = Grammar('symbol-name') | |
abi = g.symbol('abi') | |
backref = g.symbol('backref') | |
basic_type = g.symbol('basic-type') | |
binder = g.symbol('binder') | |
const = g.symbol('const') | |
const_data = g.symbol('const-data') | |
disambiguator = g.symbol('disambiguator') | |
dyn_bounds = g.symbol('dyn-bounds') | |
dyn_trait = g.symbol('dyn-trait') | |
dyn_trait_assoc_binding = g.symbol('dyn-trait-assoc-binding') | |
fn_sig = g.symbol('fn-sig') | |
generic_arg = g.symbol('generic-arg') | |
identifier = g.symbol('identifier') | |
impl_path = g.symbol('impl-path') | |
instantiating_crate = g.symbol('instantiating-crate') | |
lifetime = g.symbol('lifetime') | |
namespace = g.symbol('namespace') | |
path = g.symbol('path') | |
symbol_name = g.symbol('symbol-name') | |
type = g.symbol('type') | |
g.rule(symbol_name, '_R', Opt(DecimalNumber()), path, Opt(instantiating_crate)) | |
g.rule(path, 'C', identifier) | |
g.rule(path, 'M', impl_path, type) | |
g.rule(path, 'X', impl_path, type, path) | |
g.rule(path, 'Y', type, path) | |
g.rule(path, 'N', namespace, path, identifier) | |
g.rule(path, 'I', path, Repeat(generic_arg), 'E') | |
g.rule(path, backref) | |
g.rule(impl_path, Opt(disambiguator), path) | |
g.rule(identifier, Opt(disambiguator), UndisambiguatedIdentifier()) | |
g.rule(disambiguator, 's', Base62Number()) | |
for c in string.ascii_lowercase + string.ascii_uppercase: | |
g.rule(namespace, c) | |
g.rule(generic_arg, lifetime) | |
g.rule(generic_arg, type) | |
g.rule(generic_arg, 'K', const) | |
g.rule(lifetime, 'L', Base62Number()) | |
g.rule(binder, 'G', Base62Number()) | |
g.rule(type, basic_type) | |
g.rule(type, path) | |
g.rule(type, 'A', type, const) | |
g.rule(type, 'S', type) | |
g.rule(type, 'T', Repeat(type), 'E') | |
g.rule(type, 'R', Opt(lifetime), type) | |
g.rule(type, 'Q', Opt(lifetime), type) | |
g.rule(type, 'P', type) | |
g.rule(type, 'O', type) | |
g.rule(type, 'F', fn_sig) | |
g.rule(type, 'D', dyn_bounds, lifetime) | |
g.rule(type, backref) | |
g.rule(basic_type, 'a') | |
g.rule(basic_type, 'b') | |
g.rule(basic_type, 'c') | |
g.rule(basic_type, 'd') | |
g.rule(basic_type, 'e') | |
g.rule(basic_type, 'f') | |
g.rule(basic_type, 'h') | |
g.rule(basic_type, 'i') | |
g.rule(basic_type, 'j') | |
g.rule(basic_type, 'l') | |
g.rule(basic_type, 'm') | |
g.rule(basic_type, 'n') | |
g.rule(basic_type, 'o') | |
g.rule(basic_type, 's') | |
g.rule(basic_type, 't') | |
g.rule(basic_type, 'u') | |
g.rule(basic_type, 'v') | |
g.rule(basic_type, 'x') | |
g.rule(basic_type, 'y') | |
g.rule(basic_type, 'z') | |
g.rule(basic_type, 'p') | |
g.rule(fn_sig, Opt(binder), Opt(String('U')), Opt(Seq([String('K'), abi])), Repeat(type), 'E', type) | |
g.rule(abi, 'C') | |
g.rule(abi, UndisambiguatedIdentifier()) | |
g.rule(dyn_bounds, Opt(binder), Repeat(dyn_trait), 'E') | |
g.rule(dyn_trait, path, Repeat(dyn_trait_assoc_binding)) | |
g.rule(dyn_trait_assoc_binding, 'p', UndisambiguatedIdentifier(), type) | |
g.rule(const, 'p') | |
g.rule(const, backref) | |
g.rule(const, type, const_data) | |
g.rule(const_data, HexNumber()) | |
g.rule(backref, 'B', Base62Number()) | |
g.rule(instantiating_crate, path) | |
return g.finalize() | |
def parse(grammar, word): | |
pp = PrettyPrinter() | |
for node in Parser.parse(grammar.start, word): | |
pp.print(node) | |
def main(): | |
grammar = rust_symbol_grammar() | |
if len(sys.argv) > 1: | |
for word in sys.argv[1:]: | |
parse(grammar, word) | |
else: | |
for line in sys.stdin: | |
for word in line.split(): | |
parse(grammar, word) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment