caudamus/README.md

## README.md

      
    Raw
  

              README.md
            
          
    BNF parsing

Toy weekend project: make a parser-parser!
This takes in a grammar given in Bakus-Naur Form (BNF),
and a program that was written in the provided grammar,
and produces an image of Syntax Tree using graphviz.
References:

https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form
https://en.wikipedia.org/wiki/Abstract_syntax_tree
https://en.wikipedia.org/wiki/Compilers:_Principles,_Techniques,_and_Tools

Examples

Simple arithmetic grammar from chapter 2 of the dragon book.

simple.bnf - simple arithmetic grammar
simple.txt - example arithmetic
simple.png - generated with ./bnf.py simple.bnf simple.txt simple

Grammar for the grammar definitions themselves, which is both grammar and program.

bnf.bnf - grammar for BNF grammars
bnf.png - generated with ./bnf.py bnf.bnf bnf.bnf bnf

Toy Language (TL) from University of Texas at San Antonio.

tl.bnf - grammar for TL
example.tl - example program in TL
tl.png - generated with ./bnf.py tl.bnf example.tl tl


## bnf.bnf
grammar    = rule grammar | ;
rule       = ruleName "=" expression ";" ;
expression = list "|" expression | list | ;
list       = term list | ;
term       = literal | ruleName ;
ruleName   = identifier ;

## bnf.png

      
    Raw
  

              bnf.png
            
          
## bnf.py
#!/usr/bin/env python

import re
import argparse
from collections import namedtuple
from graphviz import Digraph

# Utility types to keep things a little bit neater
Token = namedtuple('Token', ['token', 'value'])
Rule = namedtuple('Rule', ['name', 'expression'])
Grammar = namedtuple('Grammar', ['root', 'rules'])
Tree = namedtuple('Tree', ['root', 'children', 'length'])

terminals = ['identifier', 'integer', 'symbol', 'literal']


def tokenize(s):
    ''' Return a list of tokens from a string '''
    tokens = []
    offset = 0  # Offset into string
    s = re.sub(r'%[^\n]*\n', '', s)  # Remove comments
    while offset < len(s):
        # Newlines and spaces (ignored)
        if s[offset] == '\n' or s[offset] == ' ':
            offset = offset + 1
            continue
        # Identifiers
        match = re.match(r'^[a-z_A-Z][a-zA-Z0-9]*', s[offset:])
        if match is not None:
            tokens.append(Token('identifier', match.group()))
            offset = offset + len(match.group())
            continue
        # Integers
        match = re.match(r'^[1-9][0-9]*|0', s[offset:])
        if match is not None:
            tokens.append(Token('integer', match.group()))
            offset = offset + len(match.group())
            continue
        # Symbols
        match = re.match(r'^[=|;:!<>\+\-\*\/\(\)]+', s[offset:])
        if match is not None:
            tokens.append(Token('symbol', match.group()))
            offset = offset + len(match.group())
            continue
        # Literals
        match = re.match(r'^"([^"\n]*)"', s[offset:])
        if match is not None:
            tokens.append(Token('literal', match.groups()[0]))
            offset = offset + len(match.group())
            continue
        # Error case
        raise ValueError('Unable to tokenize: {}'.format(s[offset:]))
    return tokens


def is_term(token, symbol):
    ''' Return true if token is the given terminal symbol '''
    return token.token == 'symbol' and token.value == symbol


def parse_grammar(tokens):
    ''' Parse a list of tokens into a BNF grammar '''
    rules = {}
    total = 0
    first_rule = None
    while total < len(tokens):
        # Parse rule name
        assert tokens[total].token == 'identifier', '{} {}'.format(
            tokens[total], total)
        name = tokens[total].value
        if first_rule is None:
            first_rule = name
        total += 1
        # Parse assignment symbol
        assert is_term(tokens[total], '=')
        total += 1
        # Parse expression
        expression = []
        while True:
            # Parse list
            list_ = []
            while not (is_term(tokens[total], '|') or is_term(tokens[total], ';')):
                list_.append(tokens[total].value)
                total += 1
            expression.append(list_)
            if is_term(tokens[total], ';'):
                break
            if is_term(tokens[total], '|'):
                total += 1
        # Parse terminating semicolon
        assert is_term(tokens[total], ';')
        total += 1
        # Add rule to list
        rules[name] = expression
    return Grammar(first_rule, rules)


def parse(grammar, tokens, items=None):
    ''' Given a grammar, parse a list of tokens into syntax trees given nonterminals '''
    if items is None:
        items = [grammar.root]
    if len(items) == 0:
        return []
    if len(tokens) == 0:
        return None

    item, *tail = items

    if item not in grammar.rules:
        if ((item in terminals and tokens[0].token == item) or
                (item == tokens[0].value and tokens[0].token in terminals)):
            tail_trees = parse(grammar, tokens[1:], tail)
            if tail_trees is not None:
                root = item
                if item == tokens[0].token and tokens[0].token != 'literal':
                    root = root + '\n' + tokens[0].value
                return [Tree(root, [], 1)] + tail_trees
        return None

    for exp in grammar.rules[item]:
        exp_trees = parse(grammar, tokens, exp)
        if exp_trees is None:
            continue
        offset = sum(tree.length for tree in exp_trees)
        tail_trees = parse(grammar, tokens[offset:], tail)
        if tail_trees is None:
            continue
        return [Tree(item, exp_trees, offset)] + tail_trees
    return None


def render(tree, filepath):
    ''' Draw the syntax tree as a graph to an output image file '''
    dot = Digraph(format='png')
    to_visit = [tree]
    while to_visit:
        node = to_visit.pop(0)
        dot.node(str(id(node)), label=str(node.root))
        for child in node.children:
            if isinstance(child, Tree):
                dot.edge(str(id(node)), str(id(child)))
                to_visit.append(child)
    dot.render(filepath, cleanup=True)


def main():
    parser = argparse.ArgumentParser(description='Make syntax tree graphs')
    parser.add_argument('grammar_file', help='BNF-encoded grammar to parse')
    parser.add_argument('program_file', help='program to parse with grammar')
    parser.add_argument('output_image', help='image file to save syntax tree')
    args = parser.parse_args()

    grammar = parse_grammar(tokenize(open(args.grammar_file).read()))
    program = tokenize(open(args.program_file).read())
    syntax, = parse(grammar, program)
    render(syntax, args.output_image)


if __name__ == '__main__':
    main()

## example.tl
program
    var N as int ;
    var SQRT as int ;
begin
    N := readInt ;
    SQRT := 0 ;

    % go until SQRT exceeds the square root of N
    while SQRT * SQRT <= N do
        SQRT := SQRT + 1 ;
    end ;

    SQRT := SQRT - 1 ; % subtract one SQRT is <= sqrt(N)

    writeInt SQRT ;
end

## simple.bnf
expr = term "+" expr | term "-" expr | term ;
term = factor "*" term | factor "/" term | factor ;
factor = integer | "(" expr ")" ;

## simple.png

      
    Raw
  

              simple.png
            
          
## simple.txt
9 / 3 + 5 * (2 - 0)

## tl.bnf
programText = "program" declarations "begin" statementSequence "end" ;
declarations = "var" identifier "as" type ";" declarations | ;
type = "int" | "bool" ;

statementSequence = statement ";" statementSequence | ;
statement = assignment | ifStatement | whileStatement | writeIntExpr ;
assignment = identifier ":=" "readInt" | identifier ":=" expression ;
ifStatement = "if" expression "then" statementSequence elseClause "end" ;
elseClause = "else" statementSequence | ;
whileStatement = "while" expression "do" statementSequence "end" ;
writeIntExpr = "writeInt" expression ;

expression = simpleExpression | simpleExpression compare expression ;
simpleExpression = term additive simpleExpression | term ;
term = factor multiplicative term | factor ;
factor = identifier | integer | boolean | "(" expression ")" ;

compare = "=" | "!=" | "<" | ">" | "<=" | ">=" ;
additive = "+" | "-" ;
multiplicative = "*" | "div" | "mod" ;
boolean = "true" | "false" ;

## tl.png

      
    Raw
  

              tl.png
	grammar = rule grammar \| ;
	rule = ruleName "=" expression ";" ;
	expression = list "\|" expression \| list \| ;
	list = term list \| ;
	term = literal \| ruleName ;
	ruleName = identifier ;
	#!/usr/bin/env python

	import re
	import argparse
	from collections import namedtuple
	from graphviz import Digraph

	# Utility types to keep things a little bit neater
	Token = namedtuple('Token', ['token', 'value'])
	Rule = namedtuple('Rule', ['name', 'expression'])
	Grammar = namedtuple('Grammar', ['root', 'rules'])
	Tree = namedtuple('Tree', ['root', 'children', 'length'])

	terminals = ['identifier', 'integer', 'symbol', 'literal']


	def tokenize(s):
	''' Return a list of tokens from a string '''
	tokens = []
	offset = 0 # Offset into string
	s = re.sub(r'%[^\n]*\n', '', s) # Remove comments
	while offset < len(s):
	# Newlines and spaces (ignored)
	if s[offset] == '\n' or s[offset] == ' ':
	offset = offset + 1
	continue
	# Identifiers
	match = re.match(r'^[a-z_A-Z][a-zA-Z0-9]*', s[offset:])
	if match is not None:
	tokens.append(Token('identifier', match.group()))
	offset = offset + len(match.group())
	continue
	# Integers
	match = re.match(r'^[1-9][0-9]*\|0', s[offset:])
	if match is not None:
	tokens.append(Token('integer', match.group()))
	offset = offset + len(match.group())
	continue
	# Symbols
	match = re.match(r'^[=\|;:!<>\+\-\*\/\(\)]+', s[offset:])
	if match is not None:
	tokens.append(Token('symbol', match.group()))
	offset = offset + len(match.group())
	continue
	# Literals
	match = re.match(r'^"([^"\n]*)"', s[offset:])
	if match is not None:
	tokens.append(Token('literal', match.groups()[0]))
	offset = offset + len(match.group())
	continue
	# Error case
	raise ValueError('Unable to tokenize: {}'.format(s[offset:]))
	return tokens


	def is_term(token, symbol):
	''' Return true if token is the given terminal symbol '''
	return token.token == 'symbol' and token.value == symbol


	def parse_grammar(tokens):
	''' Parse a list of tokens into a BNF grammar '''
	rules = {}
	total = 0
	first_rule = None
	while total < len(tokens):
	# Parse rule name
	assert tokens[total].token == 'identifier', '{} {}'.format(
	tokens[total], total)
	name = tokens[total].value
	if first_rule is None:
	first_rule = name
	total += 1
	# Parse assignment symbol
	assert is_term(tokens[total], '=')
	total += 1
	# Parse expression
	expression = []
	while True:
	# Parse list
	list_ = []
	while not (is_term(tokens[total], '\|') or is_term(tokens[total], ';')):
	list_.append(tokens[total].value)
	total += 1
	expression.append(list_)
	if is_term(tokens[total], ';'):
	break
	if is_term(tokens[total], '\|'):
	total += 1
	# Parse terminating semicolon
	assert is_term(tokens[total], ';')
	total += 1
	# Add rule to list
	rules[name] = expression
	return Grammar(first_rule, rules)


	def parse(grammar, tokens, items=None):
	''' Given a grammar, parse a list of tokens into syntax trees given nonterminals '''
	if items is None:
	items = [grammar.root]
	if len(items) == 0:
	return []
	if len(tokens) == 0:
	return None

	item, *tail = items

	if item not in grammar.rules:
	if ((item in terminals and tokens[0].token == item) or
	(item == tokens[0].value and tokens[0].token in terminals)):
	tail_trees = parse(grammar, tokens[1:], tail)
	if tail_trees is not None:
	root = item
	if item == tokens[0].token and tokens[0].token != 'literal':
	root = root + '\n' + tokens[0].value
	return [Tree(root, [], 1)] + tail_trees
	return None

	for exp in grammar.rules[item]:
	exp_trees = parse(grammar, tokens, exp)
	if exp_trees is None:
	continue
	offset = sum(tree.length for tree in exp_trees)
	tail_trees = parse(grammar, tokens[offset:], tail)
	if tail_trees is None:
	continue
	return [Tree(item, exp_trees, offset)] + tail_trees
	return None


	def render(tree, filepath):
	''' Draw the syntax tree as a graph to an output image file '''
	dot = Digraph(format='png')
	to_visit = [tree]
	while to_visit:
	node = to_visit.pop(0)
	dot.node(str(id(node)), label=str(node.root))
	for child in node.children:
	if isinstance(child, Tree):
	dot.edge(str(id(node)), str(id(child)))
	to_visit.append(child)
	dot.render(filepath, cleanup=True)


	def main():
	parser = argparse.ArgumentParser(description='Make syntax tree graphs')
	parser.add_argument('grammar_file', help='BNF-encoded grammar to parse')
	parser.add_argument('program_file', help='program to parse with grammar')
	parser.add_argument('output_image', help='image file to save syntax tree')
	args = parser.parse_args()

	grammar = parse_grammar(tokenize(open(args.grammar_file).read()))
	program = tokenize(open(args.program_file).read())
	syntax, = parse(grammar, program)
	render(syntax, args.output_image)


	if __name__ == '__main__':
	main()
	program
	var N as int ;
	var SQRT as int ;
	begin
	N := readInt ;
	SQRT := 0 ;

	% go until SQRT exceeds the square root of N
	while SQRT * SQRT <= N do
	SQRT := SQRT + 1 ;
	end ;

	SQRT := SQRT - 1 ; % subtract one SQRT is <= sqrt(N)

	writeInt SQRT ;
	end
	expr = term "+" expr \| term "-" expr \| term ;
	term = factor "*" term \| factor "/" term \| factor ;
	factor = integer \| "(" expr ")" ;
	programText = "program" declarations "begin" statementSequence "end" ;
	declarations = "var" identifier "as" type ";" declarations \| ;
	type = "int" \| "bool" ;

	statementSequence = statement ";" statementSequence \| ;
	statement = assignment \| ifStatement \| whileStatement \| writeIntExpr ;
	assignment = identifier ":=" "readInt" \| identifier ":=" expression ;
	ifStatement = "if" expression "then" statementSequence elseClause "end" ;
	elseClause = "else" statementSequence \| ;
	whileStatement = "while" expression "do" statementSequence "end" ;
	writeIntExpr = "writeInt" expression ;

	expression = simpleExpression \| simpleExpression compare expression ;
	simpleExpression = term additive simpleExpression \| term ;
	term = factor multiplicative term \| factor ;
	factor = identifier \| integer \| boolean \| "(" expression ")" ;

	compare = "=" \| "!=" \| "<" \| ">" \| "<=" \| ">=" ;
	additive = "+" \| "-" ;
	multiplicative = "*" \| "div" \| "mod" ;
	boolean = "true" \| "false" ;