Skip to content

Instantly share code, notes, and snippets.

@caudamus
Last active February 28, 2024 13:30
Show Gist options
  • Save caudamus/1294f197cbb3fc16c1a82e7e2f1d0ec4 to your computer and use it in GitHub Desktop.
Save caudamus/1294f197cbb3fc16c1a82e7e2f1d0ec4 to your computer and use it in GitHub Desktop.
BNF Parser

BNF parsing

Toy weekend project: make a parser-parser!

This takes in a grammar given in Bakus-Naur Form (BNF), and a program that was written in the provided grammar, and produces an image of Syntax Tree using graphviz.

References:

Examples

Simple arithmetic grammar from chapter 2 of the dragon book.

  • simple.bnf - simple arithmetic grammar
  • simple.txt - example arithmetic
  • simple.png - generated with ./bnf.py simple.bnf simple.txt simple

Grammar for the grammar definitions themselves, which is both grammar and program.

  • bnf.bnf - grammar for BNF grammars
  • bnf.png - generated with ./bnf.py bnf.bnf bnf.bnf bnf

Toy Language (TL) from University of Texas at San Antonio.

  • tl.bnf - grammar for TL
  • example.tl - example program in TL
  • tl.png - generated with ./bnf.py tl.bnf example.tl tl
grammar = rule grammar | ;
rule = ruleName "=" expression ";" ;
expression = list "|" expression | list | ;
list = term list | ;
term = literal | ruleName ;
ruleName = identifier ;
#!/usr/bin/env python
import re
import argparse
from collections import namedtuple
from graphviz import Digraph
# Utility types to keep things a little bit neater
Token = namedtuple('Token', ['token', 'value'])
Rule = namedtuple('Rule', ['name', 'expression'])
Grammar = namedtuple('Grammar', ['root', 'rules'])
Tree = namedtuple('Tree', ['root', 'children', 'length'])
terminals = ['identifier', 'integer', 'symbol', 'literal']
def tokenize(s):
''' Return a list of tokens from a string '''
tokens = []
offset = 0 # Offset into string
s = re.sub(r'%[^\n]*\n', '', s) # Remove comments
while offset < len(s):
# Newlines and spaces (ignored)
if s[offset] == '\n' or s[offset] == ' ':
offset = offset + 1
continue
# Identifiers
match = re.match(r'^[a-z_A-Z][a-zA-Z0-9]*', s[offset:])
if match is not None:
tokens.append(Token('identifier', match.group()))
offset = offset + len(match.group())
continue
# Integers
match = re.match(r'^[1-9][0-9]*|0', s[offset:])
if match is not None:
tokens.append(Token('integer', match.group()))
offset = offset + len(match.group())
continue
# Symbols
match = re.match(r'^[=|;:!<>\+\-\*\/\(\)]+', s[offset:])
if match is not None:
tokens.append(Token('symbol', match.group()))
offset = offset + len(match.group())
continue
# Literals
match = re.match(r'^"([^"\n]*)"', s[offset:])
if match is not None:
tokens.append(Token('literal', match.groups()[0]))
offset = offset + len(match.group())
continue
# Error case
raise ValueError('Unable to tokenize: {}'.format(s[offset:]))
return tokens
def is_term(token, symbol):
''' Return true if token is the given terminal symbol '''
return token.token == 'symbol' and token.value == symbol
def parse_grammar(tokens):
''' Parse a list of tokens into a BNF grammar '''
rules = {}
total = 0
first_rule = None
while total < len(tokens):
# Parse rule name
assert tokens[total].token == 'identifier', '{} {}'.format(
tokens[total], total)
name = tokens[total].value
if first_rule is None:
first_rule = name
total += 1
# Parse assignment symbol
assert is_term(tokens[total], '=')
total += 1
# Parse expression
expression = []
while True:
# Parse list
list_ = []
while not (is_term(tokens[total], '|') or is_term(tokens[total], ';')):
list_.append(tokens[total].value)
total += 1
expression.append(list_)
if is_term(tokens[total], ';'):
break
if is_term(tokens[total], '|'):
total += 1
# Parse terminating semicolon
assert is_term(tokens[total], ';')
total += 1
# Add rule to list
rules[name] = expression
return Grammar(first_rule, rules)
def parse(grammar, tokens, items=None):
''' Given a grammar, parse a list of tokens into syntax trees given nonterminals '''
if items is None:
items = [grammar.root]
if len(items) == 0:
return []
if len(tokens) == 0:
return None
item, *tail = items
if item not in grammar.rules:
if ((item in terminals and tokens[0].token == item) or
(item == tokens[0].value and tokens[0].token in terminals)):
tail_trees = parse(grammar, tokens[1:], tail)
if tail_trees is not None:
root = item
if item == tokens[0].token and tokens[0].token != 'literal':
root = root + '\n' + tokens[0].value
return [Tree(root, [], 1)] + tail_trees
return None
for exp in grammar.rules[item]:
exp_trees = parse(grammar, tokens, exp)
if exp_trees is None:
continue
offset = sum(tree.length for tree in exp_trees)
tail_trees = parse(grammar, tokens[offset:], tail)
if tail_trees is None:
continue
return [Tree(item, exp_trees, offset)] + tail_trees
return None
def render(tree, filepath):
''' Draw the syntax tree as a graph to an output image file '''
dot = Digraph(format='png')
to_visit = [tree]
while to_visit:
node = to_visit.pop(0)
dot.node(str(id(node)), label=str(node.root))
for child in node.children:
if isinstance(child, Tree):
dot.edge(str(id(node)), str(id(child)))
to_visit.append(child)
dot.render(filepath, cleanup=True)
def main():
parser = argparse.ArgumentParser(description='Make syntax tree graphs')
parser.add_argument('grammar_file', help='BNF-encoded grammar to parse')
parser.add_argument('program_file', help='program to parse with grammar')
parser.add_argument('output_image', help='image file to save syntax tree')
args = parser.parse_args()
grammar = parse_grammar(tokenize(open(args.grammar_file).read()))
program = tokenize(open(args.program_file).read())
syntax, = parse(grammar, program)
render(syntax, args.output_image)
if __name__ == '__main__':
main()
program
var N as int ;
var SQRT as int ;
begin
N := readInt ;
SQRT := 0 ;
% go until SQRT exceeds the square root of N
while SQRT * SQRT <= N do
SQRT := SQRT + 1 ;
end ;
SQRT := SQRT - 1 ; % subtract one SQRT is <= sqrt(N)
writeInt SQRT ;
end
expr = term "+" expr | term "-" expr | term ;
term = factor "*" term | factor "/" term | factor ;
factor = integer | "(" expr ")" ;
9 / 3 + 5 * (2 - 0)
programText = "program" declarations "begin" statementSequence "end" ;
declarations = "var" identifier "as" type ";" declarations | ;
type = "int" | "bool" ;
statementSequence = statement ";" statementSequence | ;
statement = assignment | ifStatement | whileStatement | writeIntExpr ;
assignment = identifier ":=" "readInt" | identifier ":=" expression ;
ifStatement = "if" expression "then" statementSequence elseClause "end" ;
elseClause = "else" statementSequence | ;
whileStatement = "while" expression "do" statementSequence "end" ;
writeIntExpr = "writeInt" expression ;
expression = simpleExpression | simpleExpression compare expression ;
simpleExpression = term additive simpleExpression | term ;
term = factor multiplicative term | factor ;
factor = identifier | integer | boolean | "(" expression ")" ;
compare = "=" | "!=" | "<" | ">" | "<=" | ">=" ;
additive = "+" | "-" ;
multiplicative = "*" | "div" | "mod" ;
boolean = "true" | "false" ;
@anhldbk
Copy link

anhldbk commented Jan 11, 2022

Nice. Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment