Cilyan/parsesegment.py

## parsesegment.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re

class ParseSegment:
    """Tiny two-state lexer and parser for ``segmentNAME { ... }`` blocks.

    The lexer is either ``"out"`` (outside any segment) or ``"in"`` (between
    an opening ``{`` and the closing ``}``).  Each state owns an ordered list
    of patterns that are tried, in order, at the current scan position.
    """

    # Patterns per lexer state.
    # Each entry is (token name, compiled pattern, state change command).
    # A token name of None consumes the match without yielding a token; a
    # state change of None keeps the lexer in its current state.
    _regexes = {
        "out": [
            ("open", re.compile(r"segment(?P<segment>\w+)\s+\{"), "in"),
        ],
        "in": [
            ("close", re.compile(r"\}"), "out"),
            # Example of something that can be matched inside a segment.
            ("content", re.compile(r"content\s+(?P<content>\w+)"), None),
        ],
    }

    def lex(self, source, initpos=0):
        """Yield the tokens found in *source*, scanning from *initpos*.

        Args:
            source: text to tokenize.
            initpos: index in *source* at which scanning starts.

        Yields:
            Tuples ``(token name, matched text, named groups dict)`` for
            every match whose table entry has a token name.

        Text that matches no pattern of the current state is skipped one
        character at a time instead of being reported as a syntax error.
        """
        pos = initpos
        end = len(source)
        state = "out"
        while pos < end:
            for token_name, reg, state_chng in self._regexes[state]:
                match = reg.match(source, pos)
                # A zero-width match consumes nothing: accepting it would
                # leave ``pos`` where it is and loop forever, so it is
                # treated like a miss and the next pattern is tried.
                if match is None or match.end() == match.start():
                    continue
                # Advance according to how much was matched.
                pos = match.end()
                if token_name is not None:
                    yield (token_name, match.group(), match.groupdict())
                if state_chng is not None:
                    state = state_chng
                break
            else:
                # No pattern matched here: skip a single character.
                pos += 1

    def parse(self, source, initpos=0):
        """Convert *source* into a dict of segment name -> list of contents.

        Args:
            source: text to parse.
            initpos: index in *source* at which lexing starts.

        Returns:
            A dict whose keys are segment names and whose values are the
            lists of content words found inside those segments.  When the
            same segment name is opened more than once, the contents of all
            occurrences are accumulated in a single list.

        Raises:
            RuntimeError: if a content token is received while no segment
                is open.
        """
        segments = {}
        cur_segment = None
        for token, _text, groups in self.lex(source, initpos):
            if token == "open":
                cur_segment = groups["segment"]
                # setdefault (rather than assigning a new list) keeps what
                # an earlier segment with the same name already collected.
                segments.setdefault(cur_segment, [])
            elif token == "content":
                if cur_segment is None:
                    raise RuntimeError("Content found outside a segment")
                segments[cur_segment].append(groups["content"])
            elif token == "close":
                cur_segment = None
            # Any other token is ignored.
        return segments

def main(path="..."):
    """Read the file at *path*, parse its segments and print the result.

    Args:
        path: file to read.  Defaults to the placeholder ``"..."`` that was
            previously hard-coded, so calling ``main()`` behaves as before.

    Returns:
        0 once the segments have been printed.
    """
    # Only the read needs the handle; release it before parsing/printing.
    with open(path, "r") as fh:
        data = fh.read()
    segments = ParseSegment().parse(data)
    print(segments)
    return 0

# Script entry point: hand main()'s return value to the interpreter as the
# process exit status instead of discarding it.
if __name__ == '__main__':
    raise SystemExit(main())
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import re

	class ParseSegment:
	    """Tiny two-state lexer and parser for ``segmentNAME { ... }`` blocks.

	    The lexer is either ``"out"`` (outside any segment) or ``"in"``
	    (between an opening ``{`` and the closing ``}``).  Each state owns an
	    ordered list of patterns that are tried, in order, at the current
	    scan position.
	    """

	    # Patterns per lexer state.
	    # Each entry is (token name, compiled pattern, state change command).
	    # A token name of None consumes the match without yielding a token; a
	    # state change of None keeps the lexer in its current state.
	    _regexes = {
	        "out": [
	            ("open", re.compile(r"segment(?P<segment>\w+)\s+\{"), "in"),
	        ],
	        "in": [
	            ("close", re.compile(r"\}"), "out"),
	            # Example of something that can be matched inside a segment.
	            ("content", re.compile(r"content\s+(?P<content>\w+)"), None),
	        ],
	    }

	    def lex(self, source, initpos=0):
	        """Yield the tokens found in *source*, scanning from *initpos*.

	        Args:
	            source: text to tokenize.
	            initpos: index in *source* at which scanning starts.

	        Yields:
	            Tuples ``(token name, matched text, named groups dict)`` for
	            every match whose table entry has a token name.

	        Text that matches no pattern of the current state is skipped one
	        character at a time instead of being reported as a syntax error.
	        """
	        pos = initpos
	        end = len(source)
	        state = "out"
	        while pos < end:
	            for token_name, reg, state_chng in self._regexes[state]:
	                match = reg.match(source, pos)
	                # A zero-width match consumes nothing: accepting it would
	                # leave ``pos`` where it is and loop forever, so it is
	                # treated like a miss and the next pattern is tried.
	                if match is None or match.end() == match.start():
	                    continue
	                # Advance according to how much was matched.
	                pos = match.end()
	                if token_name is not None:
	                    yield (token_name, match.group(), match.groupdict())
	                if state_chng is not None:
	                    state = state_chng
	                break
	            else:
	                # No pattern matched here: skip a single character.
	                pos += 1

	    def parse(self, source, initpos=0):
	        """Convert *source* into a dict of segment name -> list of contents.

	        Args:
	            source: text to parse.
	            initpos: index in *source* at which lexing starts.

	        Returns:
	            A dict whose keys are segment names and whose values are the
	            lists of content words found inside those segments.  When the
	            same segment name is opened more than once, the contents of
	            all occurrences are accumulated in a single list.

	        Raises:
	            RuntimeError: if a content token is received while no segment
	                is open.
	        """
	        segments = {}
	        cur_segment = None
	        for token, _text, groups in self.lex(source, initpos):
	            if token == "open":
	                cur_segment = groups["segment"]
	                # setdefault (rather than assigning a new list) keeps what
	                # an earlier segment with the same name already collected.
	                segments.setdefault(cur_segment, [])
	            elif token == "content":
	                if cur_segment is None:
	                    raise RuntimeError("Content found outside a segment")
	                segments[cur_segment].append(groups["content"])
	            elif token == "close":
	                cur_segment = None
	            # Any other token is ignored.
	        return segments

	def main(path="..."):
	    """Read the file at *path*, parse its segments and print the result.

	    Args:
	        path: file to read.  Defaults to the placeholder ``"..."`` that
	            was previously hard-coded, so ``main()`` behaves as before.

	    Returns:
	        0 once the segments have been printed.
	    """
	    # Only the read needs the handle; release it before parsing/printing.
	    with open(path, "r") as fh:
	        data = fh.read()
	    segments = ParseSegment().parse(data)
	    print(segments)
	    return 0

	# Script entry point: hand main()'s return value to the interpreter as
	# the process exit status instead of discarding it.
	if __name__ == '__main__':
	    raise SystemExit(main())