Last active
August 29, 2015 14:22
-
-
Save Cilyan/71dc1a7751142eb5f205 to your computer and use it in GitHub Desktop.
Example parser using a stateful lexer and a recursive descent parser. The goal is to parse a C file containing structures, unions and variables (no functions) plus some Doxygen comments, building an AST. (Originally part of a bigger project.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nodes for the Abstract Syntax Tree | |
class Node(object):
    """
    Base class for AST nodes.

    Subclasses declare ``__attributes__``, an ordered list of attribute
    names.  Each listed attribute is initialised to ``None`` by
    ``__init__`` and the first one is treated as the node's "main"
    attribute for ``__repr__``.
    """
    def __init__(self):
        self.children = []
        self.parent = None
        for attr in self.__attributes__:
            setattr(self, attr, None)
    def __repr__(self):
        # Show only the main (first) attribute for brevity.  Guard
        # against subclasses with no attributes at all (e.g. a root
        # node), which would otherwise raise IndexError.
        if not self.__attributes__:
            return "<{class_}>".format(class_ = self.__class__.__name__)
        main_attr = self.__attributes__[0]
        # Bug fix: the closing ">" was missing from the format string.
        return "<{class_} {attr_name}={attr_value}>".format(
            class_ = self.__class__.__name__,
            attr_name = main_attr,
            attr_value = getattr(self, main_attr)
        )
    def add_child(self, child_node):
        """Append ``child_node`` to children and set its parent to self."""
        self.children.append(child_node)
        child_node.set_parent(self)
    def set_parent(self, parent):
        self.parent = parent
    def print_tree(self, stream=sys.stdout, print_attrs = True, indent = 0):
        """
        Recursively pretty-print this node and its children to ``stream``,
        one node per line, indented one space per tree level.

        When ``print_attrs`` is False, only the first (main) attribute of
        each node is printed.
        """
        stream.write(" " * indent)
        stream.write("<{class_}".format(class_ = self.__class__.__name__))
        for attr in self.__attributes__:
            stream.write(" {attr_name}={attr_value}".format(
                attr_name = attr,
                attr_value = getattr(self, attr)
            ))
            if not print_attrs:
                # Trick to only print first attribute if print_attrs is not set
                break
        stream.write(">\n")
        for child in self.children:
            child.print_tree(stream, print_attrs, indent + 1)
class RootNode(Node):
    """Top-level AST node; holds parsed statements as children, no attributes."""
    __attributes__ = []
class StructNode(Node):
    """
    A C ``struct`` definition; members are stored as children.
    ``typename`` is the tag after the ``struct`` keyword (or None),
    ``name`` the identifier before the terminating semicolon (if any).
    """
    __attributes__ = ["typename", "name"]
class VariableNode(Node):
    """
    A variable/member declaration.
    ``type`` is the type token text (e.g. "SWORD" or "struct Foo"),
    ``bfsize`` the bit-field width (as a string) when declared with ":",
    ``array`` True when declared with an array size expression.
    ``description`` is never assigned in the visible parser code —
    presumably filled by a later pass; verify against callers.
    """
    __attributes__ = ["name", "type", "description", "bfsize", "array"]
class UnionNode(Node):
    """
    A C ``union`` definition; members are stored as children.
    ``typename`` is the tag after the ``union`` keyword (or None),
    ``name`` the identifier before the terminating semicolon (if any).
    """
    __attributes__ = ["typename", "name"]
class DescriptionNode(Node):
    """A Doxygen ``\\brief`` description; ``data`` is the whitespace-normalised text."""
    __attributes__ = ["data"]
class ExprNode(Node):
    """An array-size expression; ``expr`` accumulates its source text verbatim."""
    __attributes__ = ["expr"]
class InterfaceParser:
    """
    Parser that reads a C interface file containing structures, unions
    and variables and builds an abstract syntax tree that can be then
    analysed.
    """
    # Stateful lexer feeding a recursive descent parser.
    #
    # ``_regexes`` maps each lexer state to an ordered list of rules
    # ``(token_name, compiled_regex, next_state)`` where:
    #   token_name -- name of the token to yield, or None to discard it
    #   next_state -- lexer state to switch to after a match, or None to stay
    # Rules are tried in order, so earlier entries take priority.
    _regexes = {
        "out": [
            # C code.  Keyword/type patterns end with \b so that an
            # identifier that merely starts with a keyword (for example
            # "structure") is lexed as one identifier instead of being
            # split into a keyword token plus a leftover identifier.
            ("type", re.compile(
                r"(SWORD|UWORD|SBYTE|UBYTE|SLONG|ULONG|BITFIELD(8|16|32))\b"
            ), None),
            ("struct", re.compile(r"struct\b"), None),
            ("union", re.compile(r"union\b"), None),
            ("number", re.compile(r"\d+"), None),
            ("identifier", re.compile(r"\w+"), None),
            ("bf_separator", re.compile(r":"), None),
            ("semicolon", re.compile(r";"), None),
            # Parenthesis
            ("open_paren", re.compile(r"\("), None),
            ("close_paren", re.compile(r"\)"), None),
            # Blocks
            ("open_block", re.compile(r"\{"), None),
            ("close_block", re.compile(r"\}"), None),
            # Arrays
            ("open_array", re.compile(r"\["), None),
            ("close_array", re.compile(r"\]"), None),
            # Doxygen style comment ("/**" or "/*!"): switch to "doc" state
            (None, re.compile(r"/\*[*!]"), "doc"),
            # Plain comment, consumed whole
            (None, re.compile(r"/\*.*?\*/", re.DOTALL), None),
            (None, re.compile(r"//.*"), None),
            # Preprocessor macro
            (None, re.compile(r"#.*"), None),
            # Operators (low prio due to comment start)
            ("operator", re.compile(r"[-+*/]"), None),
            # Space
            (None, re.compile(r"\s+"), None),
        ],
        "doc": [
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Start of description Doxygen style
            (None, re.compile(r"(\\|@)brief"), "desc"),
            # Garbage
            (None, re.compile(r".", re.DOTALL), None),
        ],
        "desc": [
            # Other Doxygen tag ends the description
            (None, re.compile(r"(\\|@)\w+"), "doc"),
            # End of comment
            (None, re.compile(r"\*/"), "out"),
            # Documentation data, secured not to match end of comment/Doxy-tag
            ("docdata", re.compile(r"[^\\@*]+"), None),
            ("docdata", re.compile(r".", re.DOTALL), None),
        ],
    }
    # Collapses runs of whitespace inside description text.
    _re_spaces = re.compile(r"\s+")
    def __init__(self):
        self.lexer = None
        self.reset()
    # Public Interface
    def reset(self):
        """Clear all parser state and start a fresh, empty tree."""
        self.token = None        # name of the current look-ahead token
        self.fullmatch = None    # text matched by the *previous* token
        self.groups = None       # groupdict of the previous token
        self.nfullmatch = None   # text matched by the current token
        self.ngroups = None      # groupdict of the current token
        self.root = RootNode()
    def get_ast(self):
        """
        Gets the Abstract Syntax Tree as parsed by parse()
        """
        return self.root
    def lex(self, source, initpos = 0):
        """
        Lexer function, usually used internally by the parser.
        Yields ``(token_name, matched_text, groupdict)`` tuples found
        while analysing ``source`` starting at ``initpos``.

        Raises RuntimeError when no rule of the current state matches.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                # Try to get a match anchored at the current position
                match = reg.match(source, pos)
                if match:
                    # Advance according to how much was matched
                    pos = match.end()
                    # yield a token if it has a name
                    if token_name is not None:
                        # Yield token name, the full matched part of source
                        # and the match grouped according to (?P<tag>) tags
                        yield (token_name, match.group(), match.groupdict())
                    # Switch state if requested
                    if state_chng is not None:
                        state = state_chng
                    break
            else:
                # No rule matched: the input file has an error in the
                # syntax, so report the surrounding characters.
                raise RuntimeError("SyntaxError at position {}: '{}'".format(
                    pos, source[max(pos-5,0):min(pos+5,end)]
                ))
    def parse(self, source, initpos = 0):
        """
        Parses ``source`` filling the internal tree.
        When the function finishes, you can get the tree using ``get_ast``.
        """
        self.lexer = self.lex(source, initpos)
        self._lexnext()
        self._nt_file(self.root)
    # Helpers
    def _lexnext(self):
        # Lex the next token.
        # Previous match is saved. This is because the parser usually has one
        # token look ahead, so it needs the data matched together with the
        # previous token: after _accept()/_expect(), self.fullmatch holds
        # the text of the token that was just consumed.
        self.fullmatch, self.groups = self.nfullmatch, self.ngroups
        try:
            self.token, self.nfullmatch, self.ngroups = next(self.lexer)
        except StopIteration:
            # Special end-of-stream token when the lexer runs out of tokens
            self.token, self.nfullmatch, self.ngroups = "<EOS>", None, None
    def _fail(self):
        # Break parsing in case of syntax error, showing the offending
        # look-ahead token and its matched text.
        raise RuntimeError(
            "Unexpected symbol {} found ({})".format(
                self.token, self.nfullmatch
            )
        )
    def _accept(self, token):
        # Accepts a token: optional symbol.  Consumes it and returns True
        # when the look-ahead matches, otherwise returns False untouched.
        if self.token == token:
            self._lexnext()
            return True
        return False
    def _expect(self, token):
        # Expects a token: mandatory, it fails if the token is wrong
        if self.token == token:
            self._lexnext()
            return True
        self._fail()
    # Non-Terminals
    def _nt_file(self, parentnode):
        # FILE = {STMT}, <EOS> ;
        while not self._accept("<EOS>"):
            self._nt_stmt(parentnode)
    def _nt_stmt(self, parentnode):
        # STMT = STRUCT | UNION | VARIABLE | DESCRIPTION
        if self._accept("struct"):
            self._nt_struct(parentnode)
        elif self._accept("union"):
            self._nt_union(parentnode)
        elif self._accept("type"):
            self._nt_variable(parentnode)
        elif self._accept("docdata"):
            self._nt_description(parentnode)
        else:
            self._fail()
    def _nt_variable(self, parentnode, type_=None):
        # VARIABLE = (((struct|union), identifier) | type),
        #            identifier, [ ("[", EXPR, "]") | (":", number) ], ";" ;
        # ``type_`` is provided by _nt_struct/_nt_union for declarations
        # like "struct Foo bar;"; otherwise the type token just consumed
        # by _nt_stmt is used.
        var = VariableNode()
        parentnode.add_child(var)
        if type_ is None:
            type_ = self.fullmatch
        var.type = type_
        self._expect("identifier")
        var.name = self.fullmatch
        var.array = False
        if self._accept("bf_separator"):
            self._expect("number")
            var.bfsize = self.fullmatch
        elif self._accept("open_array"):
            self._nt_expr(var)
            var.array = True
            self._expect("close_array")
        self._expect("semicolon")
    def _nt_struct(self, parentnode):
        # STRUCT = struct, [identifier], "{", {STMT}, "}", [identifier], ";";
        struct = StructNode()
        if self._accept("identifier"):
            struct.typename = self.fullmatch
            if self.token == "identifier":
                # "struct Foo bar;" is actually a variable declaration
                self._nt_variable(parentnode, type_="struct " + struct.typename)
                return
        else:
            struct.typename = None
        parentnode.add_child(struct)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(struct)
        if self._accept("identifier"):
            struct.name = self.fullmatch
        self._expect("semicolon")
    def _nt_union(self, parentnode):
        # UNION = union, [identifier], "{", {STMT}, "}", [identifier], ";";
        union = UnionNode()
        if self._accept("identifier"):
            union.typename = self.fullmatch
            if self.token == "identifier":
                # "union Foo bar;" is actually a variable declaration
                self._nt_variable(parentnode, type_="union " + union.typename)
                return
        else:
            union.typename = None
        parentnode.add_child(union)
        self._expect("open_block")
        while not self._accept("close_block"):
            self._nt_stmt(union)
        if self._accept("identifier"):
            union.name = self.fullmatch
        self._expect("semicolon")
    def _nt_description(self, parentnode):
        # DESCRIPTION = {docdata};
        desc = DescriptionNode()
        parentnode.add_child(desc)
        desc.data = self.fullmatch.replace("\n", " ").strip()
        while self._accept("docdata"):
            # NOTE(review): per-chunk strip() can fuse words when a space
            # falls at a chunk boundary (e.g. around "*"); confirm intended.
            desc.data += self.fullmatch.replace("\n", " ").strip()
        desc.data = self._re_spaces.sub(" ", desc.data)
    def _nt_expr(self, parentnode):
        # EXPR = ( "(", EXPR, ")" ) | (identifier, [ operator, EXPR ]) ;
        # The expression is not evaluated: its source text is simply
        # accumulated verbatim into an ExprNode.
        def sub_expr(node):
            if self._accept("open_paren"):
                node.expr += self.fullmatch
                sub_expr(node)
                self._expect("close_paren")
                node.expr += self.fullmatch
            elif self._accept("identifier"):
                node.expr += self.fullmatch
                if self._accept("operator"):
                    node.expr += self.fullmatch
                    sub_expr(node)
            else:
                self._fail()
        node = ExprNode()
        node.expr = ""
        parentnode.add_child(node)
        sub_expr(node)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment