Example parser built from a stateful lexer and a recursive descent parser. The goal is to parse a C file containing structures, unions and variables (no functions), plus some Doxygen comments, building an AST. (Originally part of a bigger project.)
# Nodes for the Abstract Syntax Tree
class Node(object):
    """
    Base class for AST nodes.

    Subclasses declare ``__attributes__``, an ordered list of attribute
    names; the first entry is treated as the "main" attribute and is the
    one shown by ``__repr__``. Each listed attribute is initialised to
    ``None`` in ``__init__``.
    """

    # Default so that Node itself and attribute-less subclasses
    # (e.g. a root node) can be instantiated and printed safely.
    __attributes__ = []

    def __init__(self):
        self.children = []
        self.parent = None
        for attr in self.__attributes__:
            setattr(self, attr, None)

    def __repr__(self):
        # BUGFIX: the closing ">" was missing from the format string,
        # and nodes without attributes crashed on __attributes__[0].
        if not self.__attributes__:
            return "<{class_}>".format(class_=self.__class__.__name__)
        main_attr = self.__attributes__[0]
        return "<{class_} {attr_name}={attr_value}>".format(
            class_=self.__class__.__name__,
            attr_name=main_attr,
            attr_value=getattr(self, main_attr)
        )

    def add_child(self, child_node):
        """Append ``child_node`` to this node and set its parent link."""
        self.children.append(child_node)
        child_node.set_parent(self)

    def set_parent(self, parent):
        """Record ``parent`` as this node's parent."""
        self.parent = parent

    def print_tree(self, stream=sys.stdout, print_attrs=True, indent=0):
        """
        Pretty-print this node and its children to ``stream``, one node
        per line, indented one space per tree level.

        If ``print_attrs`` is false, only the first (main) attribute is
        printed for each node.
        """
        stream.write(" " * indent)
        stream.write("<{class_}".format(class_=self.__class__.__name__))
        for attr in self.__attributes__:
            stream.write(" {attr_name}={attr_value}".format(
                attr_name=attr,
                attr_value=getattr(self, attr)
            ))
            if not print_attrs:
                # Trick to only print first attribute if print_attrs is not set
                break
        stream.write(">\n")
        for child in self.children:
            child.print_tree(stream, print_attrs, indent + 1)
class RootNode(Node):
    """Top-level AST node for a parsed file; has no attributes of its own."""
    __attributes__ = []
class StructNode(Node):
    """A C ``struct`` definition; members are stored as child nodes."""
    # typename: tag after the ``struct`` keyword (None for anonymous structs)
    # name: variable name after the closing brace, if any
    __attributes__ = ["typename", "name"]
class VariableNode(Node):
    """A variable declaration (plain, bit-field, or array)."""
    # name: declared identifier
    # type: type text (e.g. "UWORD" or "struct Foo")
    # description: attached doc text, if any
    # bfsize: bit-field width as a string, or None
    # array: True when declared with "[...]"; size expression is a child node
    __attributes__ = ["name", "type", "description", "bfsize", "array"]
class UnionNode(Node):
    """A C ``union`` definition; members are stored as child nodes."""
    # typename: tag after the ``union`` keyword (None for anonymous unions)
    # name: variable name after the closing brace, if any
    __attributes__ = ["typename", "name"]
class DescriptionNode(Node):
    """Doxygen ``\\brief`` description text collected from a doc comment."""
    # data: the whitespace-normalised description string
    __attributes__ = ["data"]
class ExprNode(Node):
    """An expression (e.g. an array size), kept as its source text."""
    # expr: concatenated source text of the expression
    __attributes__ = ["expr"]
class InterfaceParser:
    """
    Parser that reads a C interface file containing structures, unions
    and variables and builds an abstract syntax tree that can be then
    analysed.
    """
    # Stateful lexer with recursive descent parser.
    #
    # _regexes maps each lexer state to an ordered list of rules
    # (token_name, compiled_regex, next_state):
    #   token_name -- name of the emitted token, or None to swallow the match
    #   next_state -- lexer state to switch to after the match (None = stay)
    # Rules are tried in order at the current position, so more specific
    # patterns must precede more general ones (e.g. "type" before
    # "identifier").
    _regexes = {
        "out": [
            # C Code.
            # BUGFIX: \b anchors keep keywords/type names from matching a
            # prefix of a longer identifier ("structure", "unions", "UWORDS"
            # must lex as plain identifiers).
            ("type", re.compile(
                r"(SWORD|UWORD|SBYTE|UBYTE|SLONG|ULONG|BITFIELD(8|16|32))\b"
            ), None),
            ("struct", re.compile(r"struct\b"), None),
            ("union", re.compile(r"union\b"), None),
            ("number", re.compile(r"\d+"), None),
            ("identifier", re.compile(r"\w+"), None),
            ("bf_separator", re.compile(r":"), None),
            ("semicolon", re.compile(r";"), None),
            # Parenthesis
            ("open_paren", re.compile(r"\("), None),
            ("close_paren", re.compile(r"\)"), None),
            # Blocks
            ("open_block", re.compile(r"\{"), None),
            ("close_block", re.compile(r"\}"), None),
            # Arrays
            ("open_array", re.compile(r"\["), None),
            ("close_array", re.compile(r"\]"), None),
            # Doxygen style comment ("/**" or "/*!") switches to "doc" state
            (None, re.compile(r"/\*[*!]"), "doc"),
            # Plain comments are swallowed entirely
            (None, re.compile(r"/\*.*?\*/", re.DOTALL), None),
            (None, re.compile(r"//.*"), None),
            # Preprocessor macro
            (None, re.compile(r"#.*"), None),
            # Operators (low prio due to comment start)
            ("operator", re.compile(r"[-+*/]"), None),
            # Space
            (None, re.compile(r"\s+"), None),
        ],
        "doc": [
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Start of description Doxygen style
            (None, re.compile(r"(\\|@)brief"), "desc"),
            # Garbage
            (None, re.compile(r".", re.DOTALL), None),
        ],
        "desc": [
            # Other Doxygen tag ends the description
            (None, re.compile(r"(\\|@)\w+"), "doc"),
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Documentation data, secured not to match end of comment/Doxy-tag
            ("docdata", re.compile(r"[^\\@*]+"), None),
            ("docdata", re.compile(r".", re.DOTALL), None),
        ],
    }
    # Used to collapse whitespace runs in collected description text.
    _re_spaces = re.compile(r"\s+")

    def __init__(self):
        self.lexer = None
        self.reset()

    # Public Interface
    def reset(self):
        """Clear all parser state and start a fresh (empty) tree."""
        self.token = None
        # fullmatch/groups hold the text of the PREVIOUS (just consumed)
        # token; nfullmatch/ngroups hold the current lookahead token's text.
        self.fullmatch = None
        self.groups = None
        self.nfullmatch = None
        self.ngroups = None
        self.root = RootNode()

    def get_ast(self):
        """
        Gets the Abstract Syntax Tree as parsed by parse()
        """
        return self.root

    def lex(self, source, initpos=0):
        """
        Lexer function, usually used internally by the parser.
        Yields ``(token_name, matched_text, groupdict)`` tuples found
        while analysing ``source``. Raises RuntimeError on input that
        no rule of the current state matches.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                # Try to get a match
                match = reg.match(source, pos)
                if match:
                    # Advance according to how much was matched
                    pos = match.end()
                    # yield a token if it has a name
                    if token_name is not None:
                        # Yield token name, the full matched part of source
                        # and the match grouped according to (?P<tag>) tags
                        yield (token_name, match.group(), match.groupdict())
                    # Switch state if requested
                    if state_chng is not None:
                        state = state_chng
                    break
            else:
                # the input file has an error in the syntax and lexer should
                # yield an exception
                raise RuntimeError("SyntaxError at position {}: '{}'".format(
                    pos, source[max(pos-5,0):min(pos+5,end)]
                ))

    def parse(self, source, initpos=0):
        """
        Parses ``source`` filling the internal tree.
        When the function finishes, you can get the tree using ``get_ast``.
        """
        self.lexer = self.lex(source, initpos)
        self._lexnext()
        self._nt_file(self.root)

    # Helpers
    def _lexnext(self):
        # Lex the next token
        # Previous match is saved. This is because the parser usually has one
        # token look ahead, so it needs the data matched together with the
        # previous token.
        self.fullmatch, self.groups = self.nfullmatch, self.ngroups
        try:
            self.token, self.nfullmatch, self.ngroups = next(self.lexer)
        except StopIteration:
            # Special end-of-stream token when the lexer runs out of tokens
            self.token, self.nfullmatch, self.ngroups = "<EOS>", None, None

    def _fail(self):
        # Break parsing in case of syntax error
        raise RuntimeError(
            "Unexpected symbol {} found ({})".format(
                self.token, self.nfullmatch
            )
        )

    def _accept(self, token):
        # Accepts a token: optional symbol; consumes it only if it matches.
        if self.token == token:
            self._lexnext()
            return True
        return False

    def _expect(self, token):
        # Expects a token: mandatory, it fails if the token is wrong
        if self.token == token:
            self._lexnext()
            return True
        self._fail()

    # Non-Terminals
    def _nt_file(self, parentnode):
        # FILE = {STMT}, <EOS> ;
        while not self._accept("<EOS>"):
            self._nt_stmt(parentnode)

    def _nt_stmt(self, parentnode):
        # STMT = STRUCT | UNION | VARIABLE | DESCRIPTION
        if self._accept("struct"):
            self._nt_struct(parentnode)
        elif self._accept("union"):
            self._nt_union(parentnode)
        elif self._accept("type"):
            self._nt_variable(parentnode)
        elif self._accept("docdata"):
            self._nt_description(parentnode)
        else:
            self._fail()

    def _nt_variable(self, parentnode, type_=None):
        # VARIABLE = (((struct|union), identifier) | type),
        #            identifier, [ ("[", EXPR, "]") | (":", number) ], ";" ;
        # When called from _nt_struct/_nt_union, type_ carries the already
        # consumed "struct X"/"union X" prefix; otherwise the type token
        # was just accepted and its text is in self.fullmatch.
        var = VariableNode()
        parentnode.add_child(var)
        if type_ is None:
            type_ = self.fullmatch
        var.type = type_
        self._expect("identifier")
        var.name = self.fullmatch
        var.array = False
        if self._accept("bf_separator"):
            # Bit-field: width is kept as its source string.
            self._expect("number")
            var.bfsize = self.fullmatch
        elif self._accept("open_array"):
            # Array: the size expression becomes a child ExprNode.
            self._nt_expr(var)
            var.array = True
            self._expect("close_array")
        self._expect("semicolon")

    def _nt_struct(self, parentnode):
        # STRUCT = struct, [identifier], "{", {STMT}, "}", [identifier], ";";
        struct = StructNode()
        if self._accept("identifier"):
            struct.typename = self.fullmatch
            if self.token == "identifier":
                # "struct X y;" is a variable declaration, not a definition.
                self._nt_variable(parentnode, type_="struct " + struct.typename)
                return
        else:
            struct.typename = None
        parentnode.add_child(struct)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(struct)
        if self._accept("identifier"):
            struct.name = self.fullmatch
        self._expect("semicolon")

    def _nt_union(self, parentnode):
        # UNION = union, [identifier], "{", {STMT}, "}", [identifier], ";";
        union = UnionNode()
        if self._accept("identifier"):
            union.typename = self.fullmatch
            if self.token == "identifier":
                # "union X y;" is a variable declaration, not a definition.
                self._nt_variable(parentnode, type_="union " + union.typename)
                return
        else:
            union.typename = None
        parentnode.add_child(union)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(union)
        if self._accept("identifier"):
            union.name = self.fullmatch
        self._expect("semicolon")

    def _nt_description(self, parentnode):
        # DESCRIPTION = {docdata};
        # NOTE(review): consecutive docdata pieces are stripped and then
        # concatenated with no separator, which can glue words across
        # comment lines -- confirm this is intended before changing it.
        desc = DescriptionNode()
        parentnode.add_child(desc)
        desc.data = self.fullmatch.replace("\n", " ").strip()
        while self._accept("docdata"):
            desc.data += self.fullmatch.replace("\n", " ").strip()
        desc.data = self._re_spaces.sub(" ", desc.data)

    def _nt_expr(self, parentnode):
        # EXPR = ( "(", EXPR, ")" ) | (identifier, [ operator, EXPR ]) ;
        # The expression is not evaluated; its source text is accumulated
        # into a single ExprNode.
        def sub_expr(node):
            if self._accept("open_paren"):
                node.expr += self.fullmatch
                sub_expr(node)
                self._expect("close_paren")
                node.expr += self.fullmatch
            elif self._accept("identifier"):
                node.expr += self.fullmatch
                if self._accept("operator"):
                    node.expr += self.fullmatch
                    sub_expr(node)
            else:
                self._fail()
        # (Removed dead local "par = False" -- it was never read.)
        node = ExprNode()
        node.expr = ""
        parentnode.add_child(node)
        sub_expr(node)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment