Example parser using a stateful lexer and a recursive descent parser. The goal is to parse a C file containing structures, unions and variables (no functions), plus some Doxygen comments, and build an AST. (Originally part of a bigger project.)
import re
import sys

# Nodes for the Abstract Syntax Tree
class Node(object):
    def __init__(self):
        self.children = []
        self.parent = None
        for attr in self.__attributes__:
            setattr(self, attr, None)
    def __repr__(self):
        main_attr = self.__attributes__[0]
        return "<{class_} {attr_name}={attr_value}>".format(
            class_ = self.__class__.__name__,
            attr_name = main_attr,
            attr_value = getattr(self, main_attr)
        )
    def add_child(self, child_node):
        self.children.append(child_node)
        child_node.set_parent(self)
    def set_parent(self, parent):
        self.parent = parent
    def print_tree(self, stream=sys.stdout, print_attrs = True, indent = 0):
        stream.write(" " * indent)
        stream.write("<{class_}".format(class_ = self.__class__.__name__))
        for attr in self.__attributes__:
            stream.write(" {attr_name}={attr_value}".format(
                attr_name = attr,
                attr_value = getattr(self, attr)
            ))
            if not print_attrs:
                # Trick to only print the first attribute if print_attrs is not set
                break
        stream.write(">\n")
        for child in self.children:
            child.print_tree(stream, print_attrs, indent + 1)
class RootNode(Node):
    __attributes__ = []
class StructNode(Node):
    __attributes__ = ["typename", "name"]
class VariableNode(Node):
    __attributes__ = ["name", "type", "description", "bfsize", "array"]
class UnionNode(Node):
    __attributes__ = ["typename", "name"]
class DescriptionNode(Node):
    __attributes__ = ["data"]
class ExprNode(Node):
    __attributes__ = ["expr"]
class InterfaceParser:
    """
    Parser that reads a C interface file containing structures, unions
    and variables and builds an abstract syntax tree that can then be
    analysed.
    """
    # Stateful lexer with recursive descent parser
    _regexes = {
        "out": [
            # C Code
            ("type", re.compile(
                r"(SWORD|UWORD|SBYTE|UBYTE|SLONG|ULONG|BITFIELD(8|16|32))"
            ), None),
            ("struct", re.compile("struct"), None),
            ("union", re.compile("union"), None),
            ("number", re.compile(r"\d+"), None),
            ("identifier", re.compile(r"\w+"), None),
            ("bf_separator", re.compile(r":"), None),
            ("semicolon", re.compile(r";"), None),
            # Parentheses
            ("open_paren", re.compile(r"\("), None),
            ("close_paren", re.compile(r"\)"), None),
            # Blocks
            ("open_block", re.compile(r"\{"), None),
            ("close_block", re.compile(r"\}"), None),
            # Arrays
            ("open_array", re.compile(r"\["), None),
            ("close_array", re.compile(r"\]"), None),
            # Doxygen style comment
            (None, re.compile(r"/\*[*!]"), "doc"),
            # Comment
            (None, re.compile(r"/\*.*?\*/", re.DOTALL), None),
            (None, re.compile(r"//.*"), None),
            # Preprocessor macro
            (None, re.compile(r"#.*"), None),
            # Operators (low priority because "/" can also start a comment)
            ("operator", re.compile(r"[-+*/]"), None),
            # Whitespace
            (None, re.compile(r"\s+"), None),
        ],
        "doc": [
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Start of a Doxygen-style description
            (None, re.compile(r"(\\|@)brief"), "desc"),
            # Garbage
            (None, re.compile(r".", re.DOTALL), None),
        ],
        "desc": [
            # Other Doxygen tag
            (None, re.compile(r"(\\|@)\w+"), "doc"),
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Documentation data, secured not to match end of comment/Doxygen tag
            ("docdata", re.compile(r"[^\\@*]+"), None),
            ("docdata", re.compile(r".", re.DOTALL), None),
        ],
    }
    _re_spaces = re.compile(r"\s+")
    def __init__(self):
        self.lexer = None
        self.reset()
    # Public Interface
    def reset(self):
        self.token = None
        self.fullmatch = None
        self.groups = None
        self.nfullmatch = None
        self.ngroups = None
        self.root = RootNode()
    def get_ast(self):
        """
        Gets the Abstract Syntax Tree as parsed by parse()
        """
        return self.root
    def lex(self, source, initpos = 0):
        """
        Lexer function, usually used internally by the parser.
        Yields tokens found while analysing ``source``.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                # Try to get a match
                match = reg.match(source, pos)
                if match:
                    # Advance according to how much was matched
                    pos = match.end()
                    # Yield a token if it has a name
                    if token_name is not None:
                        # Yield the token name, the full matched part of source
                        # and the match grouped according to (?P<tag>) tags
                        yield (token_name, match.group(), match.groupdict())
                    # Switch state if requested
                    if state_chng is not None:
                        state = state_chng
                    break
            else:
                # No regex matched: the input has a syntax error, so the lexer
                # raises an exception
                raise RuntimeError("SyntaxError at position {}: '{}'".format(
                    pos, source[max(pos-5,0):min(pos+5,end)]
                ))
    def parse(self, source, initpos = 0):
        """
        Parses ``source`` filling the internal tree.
        When the function finishes, you can get the tree using ``get_ast``.
        """
        self.lexer = self.lex(source, initpos)
        self._lexnext()
        self._nt_file(self.root)
    # Helpers
    def _lexnext(self):
        # Lex the next token.
        # The previous match is saved, because the parser usually has one
        # token of look-ahead, so it needs the data matched together with the
        # previous token.
        self.fullmatch, self.groups = self.nfullmatch, self.ngroups
        try:
            self.token, self.nfullmatch, self.ngroups = next(self.lexer)
        except StopIteration:
            # Special end-of-stream token when the lexer runs out of tokens
            self.token, self.nfullmatch, self.ngroups = "<EOS>", None, None
    def _fail(self):
        # Break parsing in case of a syntax error
        raise RuntimeError(
            "Unexpected symbol {} found ({})".format(
                self.token, self.nfullmatch
            )
        )
    def _accept(self, token):
        # Accept a token: the symbol is optional
        if self.token == token:
            self._lexnext()
            return True
        return False
    def _expect(self, token):
        # Expect a token: the symbol is mandatory, parsing fails if it is wrong
        if self.token == token:
            self._lexnext()
            return True
        self._fail()
    # Non-Terminals
    def _nt_file(self, parentnode):
        # FILE = {STMT}, <EOS> ;
        while not self._accept("<EOS>"):
            self._nt_stmt(parentnode)
    def _nt_stmt(self, parentnode):
        # STMT = STRUCT | UNION | VARIABLE | DESCRIPTION
        if self._accept("struct"):
            self._nt_struct(parentnode)
        elif self._accept("union"):
            self._nt_union(parentnode)
        elif self._accept("type"):
            self._nt_variable(parentnode)
        elif self._accept("docdata"):
            self._nt_description(parentnode)
        else:
            self._fail()
    def _nt_variable(self, parentnode, type_=None):
        # VARIABLE = (((struct|union), identifier) | type),
        #            identifier, [ ("[", EXPR, "]") | (":", number) ], ";" ;
        var = VariableNode()
        parentnode.add_child(var)
        if type_ is None:
            type_ = self.fullmatch
        var.type = type_
        self._expect("identifier")
        var.name = self.fullmatch
        var.array = False
        if self._accept("bf_separator"):
            self._expect("number")
            var.bfsize = self.fullmatch
        elif self._accept("open_array"):
            self._nt_expr(var)
            var.array = True
            self._expect("close_array")
        self._expect("semicolon")
    def _nt_struct(self, parentnode):
        # STRUCT = struct, [identifier], "{", {STMT}, "}", [identifier], ";";
        struct = StructNode()
        if self._accept("identifier"):
            struct.typename = self.fullmatch
            if self.token == "identifier":
                self._nt_variable(parentnode, type_="struct " + struct.typename)
                return
        else:
            struct.typename = None
        parentnode.add_child(struct)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(struct)
        if self._accept("identifier"):
            struct.name = self.fullmatch
        self._expect("semicolon")
    def _nt_union(self, parentnode):
        # UNION = union, [identifier], "{", {STMT}, "}", [identifier], ";";
        union = UnionNode()
        if self._accept("identifier"):
            union.typename = self.fullmatch
            if self.token == "identifier":
                self._nt_variable(parentnode, type_="union " + union.typename)
                return
        else:
            union.typename = None
        parentnode.add_child(union)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(union)
        if self._accept("identifier"):
            union.name = self.fullmatch
        self._expect("semicolon")
    def _nt_description(self, parentnode):
        # DESCRIPTION = {docdata};
        desc = DescriptionNode()
        parentnode.add_child(desc)
        desc.data = self.fullmatch.replace("\n", " ").strip()
        while self._accept("docdata"):
            desc.data += self.fullmatch.replace("\n", " ").strip()
        desc.data = self._re_spaces.sub(" ", desc.data)
    def _nt_expr(self, parentnode):
        # EXPR = ( "(", EXPR, ")" ) | (identifier, [ operator, EXPR ]) ;
        def sub_expr(node):
            if self._accept("open_paren"):
                node.expr += self.fullmatch
                sub_expr(node)
                self._expect("close_paren")
                node.expr += self.fullmatch
            elif self._accept("identifier"):
                node.expr += self.fullmatch
                if self._accept("operator"):
                    node.expr += self.fullmatch
                    sub_expr(node)
            else:
                self._fail()
        node = ExprNode()
        node.expr = ""
        parentnode.add_child(node)
        sub_expr(node)
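
For reference, here is a minimal usage sketch. The sample C fragment, the identifiers in it (point_t, origin, FLAG_COUNT) and the __main__ guard are illustrative only and not part of the original gist; it simply dumps the token stream from the stateful lexer, then parses the same source and prints the resulting AST.

if __name__ == "__main__":
    # Hypothetical interface fragment, restricted to the constructs the grammar covers
    sample = r"""
    /** \brief A 2D point used by the interface. */
    struct point_t {
        SWORD x;
        SWORD y : 4;
        UWORD flags[FLAG_COUNT];
    } origin;
    """
    parser = InterfaceParser()
    # Raw token stream produced by the lexer (name, matched text, named groups)
    for name, text, _ in parser.lex(sample):
        print(name, repr(text))
    # Build the AST and display it
    parser.parse(sample)
    parser.get_ast().print_tree()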