Example parser built from a stateful lexer and a recursive descent parser. The goal is to parse a C file containing structures, unions and variables (no functions), plus some Doxygen comments, building an AST. (Originally part of a bigger project.)
# Nodes for the Abstract Syntax Tree
class Node(object):
    """
    Base class for AST nodes.

    Subclasses declare ``__attributes__``, an ordered list of attribute
    names; the first entry is treated as the "main" attribute and is the
    one shown by ``__repr__``. Each listed attribute is initialised to
    ``None`` in ``__init__``.
    """

    # Default so that Node itself and attribute-less subclasses
    # (e.g. a root node) can be instantiated and printed safely.
    __attributes__ = []

    def __init__(self):
        self.children = []
        self.parent = None
        for attr in self.__attributes__:
            setattr(self, attr, None)

    def __repr__(self):
        # BUGFIX: the closing ">" was missing from the format string,
        # and nodes without attributes crashed on __attributes__[0].
        if not self.__attributes__:
            return "<{class_}>".format(class_=self.__class__.__name__)
        main_attr = self.__attributes__[0]
        return "<{class_} {attr_name}={attr_value}>".format(
            class_=self.__class__.__name__,
            attr_name=main_attr,
            attr_value=getattr(self, main_attr)
        )

    def add_child(self, child_node):
        """Append ``child_node`` to this node and set its parent link."""
        self.children.append(child_node)
        child_node.set_parent(self)

    def set_parent(self, parent):
        """Record ``parent`` as this node's parent."""
        self.parent = parent

    def print_tree(self, stream=sys.stdout, print_attrs=True, indent=0):
        """
        Pretty-print this node and its children to ``stream``, one node
        per line, indented one space per tree level.

        If ``print_attrs`` is false, only the first (main) attribute is
        printed for each node.
        """
        stream.write(" " * indent)
        stream.write("<{class_}".format(class_=self.__class__.__name__))
        for attr in self.__attributes__:
            stream.write(" {attr_name}={attr_value}".format(
                attr_name=attr,
                attr_value=getattr(self, attr)
            ))
            if not print_attrs:
                # Trick to only print first attribute if print_attrs is not set
                break
        stream.write(">\n")
        for child in self.children:
            child.print_tree(stream, print_attrs, indent + 1)
class RootNode(Node):
    """Top-level AST node for a parsed file; has no attributes of its own."""
    __attributes__ = []
class StructNode(Node):
    """A C ``struct`` definition; members are stored as child nodes."""
    # typename: tag after the ``struct`` keyword (None for anonymous structs)
    # name: variable name after the closing brace, if any
    __attributes__ = ["typename", "name"]
class VariableNode(Node):
    """A variable declaration (plain, bit-field, or array)."""
    # name: declared identifier
    # type: type text (e.g. "UWORD" or "struct Foo")
    # description: attached doc text, if any
    # bfsize: bit-field width as a string, or None
    # array: True when declared with "[...]"; size expression is a child node
    __attributes__ = ["name", "type", "description", "bfsize", "array"]
class UnionNode(Node):
    """A C ``union`` definition; members are stored as child nodes."""
    # typename: tag after the ``union`` keyword (None for anonymous unions)
    # name: variable name after the closing brace, if any
    __attributes__ = ["typename", "name"]
class DescriptionNode(Node):
    """Doxygen ``\\brief`` description text collected from a doc comment."""
    # data: the whitespace-normalised description string
    __attributes__ = ["data"]
class ExprNode(Node):
    """An expression (e.g. an array size), kept as its source text."""
    # expr: concatenated source text of the expression
    __attributes__ = ["expr"]
class InterfaceParser:
    """
    Parser that reads a C interface file containing structures, unions
    and variables and builds an abstract syntax tree that can be then
    analysed.
    """
    # Stateful lexer with recursive descent parser.
    #
    # _regexes maps each lexer state to an ordered list of rules
    # (token_name, compiled_regex, next_state):
    #   token_name -- name of the emitted token, or None to swallow the match
    #   next_state -- lexer state to switch to after the match (None = stay)
    # Rules are tried in order at the current position, so more specific
    # patterns must precede more general ones (e.g. "type" before
    # "identifier").
    _regexes = {
        "out": [
            # C Code.
            # BUGFIX: \b anchors keep keywords/type names from matching a
            # prefix of a longer identifier ("structure", "unions", "UWORDS"
            # must lex as plain identifiers).
            ("type", re.compile(
                r"(SWORD|UWORD|SBYTE|UBYTE|SLONG|ULONG|BITFIELD(8|16|32))\b"
            ), None),
            ("struct", re.compile(r"struct\b"), None),
            ("union", re.compile(r"union\b"), None),
            ("number", re.compile(r"\d+"), None),
            ("identifier", re.compile(r"\w+"), None),
            ("bf_separator", re.compile(r":"), None),
            ("semicolon", re.compile(r";"), None),
            # Parenthesis
            ("open_paren", re.compile(r"\("), None),
            ("close_paren", re.compile(r"\)"), None),
            # Blocks
            ("open_block", re.compile(r"\{"), None),
            ("close_block", re.compile(r"\}"), None),
            # Arrays
            ("open_array", re.compile(r"\["), None),
            ("close_array", re.compile(r"\]"), None),
            # Doxygen style comment ("/**" or "/*!") switches to "doc" state
            (None, re.compile(r"/\*[*!]"), "doc"),
            # Plain comments are swallowed entirely
            (None, re.compile(r"/\*.*?\*/", re.DOTALL), None),
            (None, re.compile(r"//.*"), None),
            # Preprocessor macro
            (None, re.compile(r"#.*"), None),
            # Operators (low prio due to comment start)
            ("operator", re.compile(r"[-+*/]"), None),
            # Space
            (None, re.compile(r"\s+"), None),
        ],
        "doc": [
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Start of description Doxygen style
            (None, re.compile(r"(\\|@)brief"), "desc"),
            # Garbage
            (None, re.compile(r".", re.DOTALL), None),
        ],
        "desc": [
            # Other Doxygen tag ends the description
            (None, re.compile(r"(\\|@)\w+"), "doc"),
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Documentation data, secured not to match end of comment/Doxy-tag
            ("docdata", re.compile(r"[^\\@*]+"), None),
            ("docdata", re.compile(r".", re.DOTALL), None),
        ],
    }
    # Used to collapse whitespace runs in collected description text.
    _re_spaces = re.compile(r"\s+")

    def __init__(self):
        self.lexer = None
        self.reset()

    # Public Interface
    def reset(self):
        """Clear all parser state and start a fresh (empty) tree."""
        self.token = None
        # fullmatch/groups hold the text of the PREVIOUS (just consumed)
        # token; nfullmatch/ngroups hold the current lookahead token's text.
        self.fullmatch = None
        self.groups = None
        self.nfullmatch = None
        self.ngroups = None
        self.root = RootNode()

    def get_ast(self):
        """
        Gets the Abstract Syntax Tree as parsed by parse()
        """
        return self.root

    def lex(self, source, initpos=0):
        """
        Lexer function, usually used internally by the parser.
        Yields ``(token_name, matched_text, groupdict)`` tuples found
        while analysing ``source``. Raises RuntimeError on input that
        no rule of the current state matches.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                # Try to get a match
                match = reg.match(source, pos)
                if match:
                    # Advance according to how much was matched
                    pos = match.end()
                    # yield a token if it has a name
                    if token_name is not None:
                        # Yield token name, the full matched part of source
                        # and the match grouped according to (?P<tag>) tags
                        yield (token_name, match.group(), match.groupdict())
                    # Switch state if requested
                    if state_chng is not None:
                        state = state_chng
                    break
            else:
                # the input file has an error in the syntax and lexer should
                # yield an exception
                raise RuntimeError("SyntaxError at position {}: '{}'".format(
                    pos, source[max(pos-5,0):min(pos+5,end)]
                ))

    def parse(self, source, initpos=0):
        """
        Parses ``source`` filling the internal tree.
        When the function finishes, you can get the tree using ``get_ast``.
        """
        self.lexer = self.lex(source, initpos)
        self._lexnext()
        self._nt_file(self.root)

    # Helpers
    def _lexnext(self):
        # Lex the next token
        # Previous match is saved. This is because the parser usually has one
        # token look ahead, so it needs the data matched together with the
        # previous token.
        self.fullmatch, self.groups = self.nfullmatch, self.ngroups
        try:
            self.token, self.nfullmatch, self.ngroups = next(self.lexer)
        except StopIteration:
            # Special end-of-stream token when the lexer runs out of tokens
            self.token, self.nfullmatch, self.ngroups = "<EOS>", None, None

    def _fail(self):
        # Break parsing in case of syntax error
        raise RuntimeError(
            "Unexpected symbol {} found ({})".format(
                self.token, self.nfullmatch
            )
        )

    def _accept(self, token):
        # Accepts a token: optional symbol; consumes it only if it matches.
        if self.token == token:
            self._lexnext()
            return True
        return False

    def _expect(self, token):
        # Expects a token: mandatory, it fails if the token is wrong
        if self.token == token:
            self._lexnext()
            return True
        self._fail()

    # Non-Terminals
    def _nt_file(self, parentnode):
        # FILE = {STMT}, <EOS> ;
        while not self._accept("<EOS>"):
            self._nt_stmt(parentnode)

    def _nt_stmt(self, parentnode):
        # STMT = STRUCT | UNION | VARIABLE | DESCRIPTION
        if self._accept("struct"):
            self._nt_struct(parentnode)
        elif self._accept("union"):
            self._nt_union(parentnode)
        elif self._accept("type"):
            self._nt_variable(parentnode)
        elif self._accept("docdata"):
            self._nt_description(parentnode)
        else:
            self._fail()

    def _nt_variable(self, parentnode, type_=None):
        # VARIABLE = (((struct|union), identifier) | type),
        #            identifier, [ ("[", EXPR, "]") | (":", number) ], ";" ;
        # When called from _nt_struct/_nt_union, type_ carries the already
        # consumed "struct X"/"union X" prefix; otherwise the type token
        # was just accepted and its text is in self.fullmatch.
        var = VariableNode()
        parentnode.add_child(var)
        if type_ is None:
            type_ = self.fullmatch
        var.type = type_
        self._expect("identifier")
        var.name = self.fullmatch
        var.array = False
        if self._accept("bf_separator"):
            # Bit-field: width is kept as its source string.
            self._expect("number")
            var.bfsize = self.fullmatch
        elif self._accept("open_array"):
            # Array: the size expression becomes a child ExprNode.
            self._nt_expr(var)
            var.array = True
            self._expect("close_array")
        self._expect("semicolon")

    def _nt_struct(self, parentnode):
        # STRUCT = struct, [identifier], "{", {STMT}, "}", [identifier], ";";
        struct = StructNode()
        if self._accept("identifier"):
            struct.typename = self.fullmatch
            if self.token == "identifier":
                # "struct X y;" is a variable declaration, not a definition.
                self._nt_variable(parentnode, type_="struct " + struct.typename)
                return
        else:
            struct.typename = None
        parentnode.add_child(struct)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(struct)
        if self._accept("identifier"):
            struct.name = self.fullmatch
        self._expect("semicolon")

    def _nt_union(self, parentnode):
        # UNION = union, [identifier], "{", {STMT}, "}", [identifier], ";";
        union = UnionNode()
        if self._accept("identifier"):
            union.typename = self.fullmatch
            if self.token == "identifier":
                # "union X y;" is a variable declaration, not a definition.
                self._nt_variable(parentnode, type_="union " + union.typename)
                return
        else:
            union.typename = None
        parentnode.add_child(union)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(union)
        if self._accept("identifier"):
            union.name = self.fullmatch
        self._expect("semicolon")

    def _nt_description(self, parentnode):
        # DESCRIPTION = {docdata};
        # NOTE(review): consecutive docdata pieces are stripped and then
        # concatenated with no separator, which can glue words across
        # comment lines -- confirm this is intended before changing it.
        desc = DescriptionNode()
        parentnode.add_child(desc)
        desc.data = self.fullmatch.replace("\n", " ").strip()
        while self._accept("docdata"):
            desc.data += self.fullmatch.replace("\n", " ").strip()
        desc.data = self._re_spaces.sub(" ", desc.data)

    def _nt_expr(self, parentnode):
        # EXPR = ( "(", EXPR, ")" ) | (identifier, [ operator, EXPR ]) ;
        # The expression is not evaluated; its source text is accumulated
        # into a single ExprNode.
        def sub_expr(node):
            if self._accept("open_paren"):
                node.expr += self.fullmatch
                sub_expr(node)
                self._expect("close_paren")
                node.expr += self.fullmatch
            elif self._accept("identifier"):
                node.expr += self.fullmatch
                if self._accept("operator"):
                    node.expr += self.fullmatch
                    sub_expr(node)
            else:
                self._fail()
        # (Removed dead local "par = False" -- it was never read.)
        node = ExprNode()
        node.expr = ""
        parentnode.add_child(node)
        sub_expr(node)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment