Created
February 11, 2014 13:33
-
-
Save Cilyan/8934877 to your computer and use it in GitHub Desktop.
Simple lex/parse pattern with one state machine in lexer and another in parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
class ParseSegment:
    """Small lexer/parser pair, each driven by its own state machine.

    The lexer (:meth:`lex`) walks the source text and emits tokens according
    to the pattern table of its current state.  The parser (:meth:`parse`)
    consumes those tokens and builds a dictionary mapping each segment name
    to the list of content words found inside that segment.
    """

    # Dictionary of patterns per lexer state.
    # Tuples are (token name, pattern, state change command):
    #   - token name: yielded by the lexer, or None to match silently
    #   - pattern: compiled regex tried at the current position
    #   - state change command: name of the next state, or None to stay
    _regexes = {
        "out": [
            # NOTE(review): as written, the name must follow the keyword
            # "segment" directly (no whitespace in between) -- confirm this
            # is the intended syntax.
            ("open", re.compile(r"segment(?P<segment>\w+)\s+\{"), "in")
        ],
        "in": [
            ("close", re.compile(r"\}"), "out"),
            # Here an example of what you could want to match
            ("content", re.compile(r"content\s+(?P<content>\w+)"), None)
        ]
    }

    def lex(self, source, initpos=0):
        """Yield tokens found in *source*, starting at index *initpos*.

        The lexer starts in the ``"out"`` state.  At each position the
        patterns of the current state are tried in table order; the first
        one that matches wins.

        Each yielded token is a tuple ``(token_name, text, groups)`` where
        ``text`` is the full matched part of the source and ``groups`` is
        the match grouped according to the ``(?P<tag>)`` tags.

        Characters that match no pattern of the current state are skipped
        silently, one at a time.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                # Try to get a match anchored at the current position
                match = reg.match(source, pos)
                if match:
                    # Advance according to how much was matched
                    pos = match.end()
                    # Yield a token if it has a name
                    if token_name is not None:
                        yield (token_name, match.group(), match.groupdict())
                    # Switch state if requested
                    if state_chng is not None:
                        state = state_chng
                    # First matching pattern wins; restart from new position
                    break
            else:
                # No pattern matched (the for loop ended without break):
                # advance by one character.
                # This is particular to that lexer, usually no match means
                # the input file has an error in the syntax and lexer should
                # raise an exception
                pos += 1

    def parse(self, source, initpos=0):
        """Convert *source* into a dictionary of segments.

        This is an example of use of the lexer with a parser.  Keys of the
        returned dictionary are segment names, and values are the lists of
        contents found in each segment, in order of appearance.

        Raises:
            RuntimeError: if a ``content`` token arrives while no segment is
                open.  With the current pattern table the lexer only emits
                ``content`` after an ``open``, so this acts as a guard.
        """
        segments = {}
        cur_segment = None
        # Use lexer to get tokens from source; the matched text is not needed
        for token, _fullmatch, groups in self.lex(source, initpos):
            if token == "open":
                # On open, create the list of content in segments.
                # NOTE(review): a segment name that is opened again replaces
                # the list collected earlier -- confirm this is intended.
                cur_segment = groups["segment"]
                segments[cur_segment] = []
            elif token == "content":
                # On content, ensure we know the segment and add content to
                # the list
                if cur_segment is None:
                    raise RuntimeError("Content found outside a segment")
                segments[cur_segment].append(groups["content"])
            elif token == "close":
                # On close, set the current segment to unknown
                cur_segment = None
            # ignore unknown tokens, we could raise an error instead
        return segments
def main(path="..."):
    """Parse the file at *path* and print the resulting segments dictionary.

    *path* defaults to the placeholder ``"..."`` used by the original
    example; pass a real file name to parse something else.

    Returns:
        0, intended as a process exit status.
    """
    with open(path, "r") as fh:
        data = fh.read()
    lexer = ParseSegment()
    segments = lexer.parse(data)
    print(segments)
    return 0
if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status
    # instead of discarding it.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment