CurtisFenner/pegcombinator.py

## pegcombinator.py
import re


class ParseResult:
    """
    Represents the result of parsing an input.
    `object` is the thing that was parsed from the beginning of the input.
    `rest` is everything that follows the `object` within the input.
    """

    def __init__(self, object, rest):
        self.object = object
        self.rest = rest

    def __str__(self):
        return "ParseResult(object=" + str(self.object) + ", rest=" + repr(self.rest) + ")"


class RegexParser:
    """
    RegexParser uses a regex to parse from an input.
    The object that is parsed is the regex groups, followed by the entire match,
    as a tuple.
    """

    def __init__(self, regex):
        self.regex = regex

    def parse(self, string: str, context) -> ParseResult:
        match = re.match(self.regex, string)
        if not match:
            return None
        return ParseResult(match.groups() + (match.group(),), string[match.end():])


class SeqParser:
    """
    SeqParser parses a sequence of several sub-parsers, stored in a dict.
    The parsed object is a dict, where each sub-parser's result was placed into
    the corresponding key in the output dict.
    """

    def __init__(self, parsers: dict):
        self.parsers = parsers

    def parse(self, string: str, context) -> ParseResult:
        record = {}
        rest = string
        for key, parser in self.parsers.items():
            sub_result = parser.parse(rest, context)
            if not sub_result:
                return None
            record[key] = sub_result.object
            rest = sub_result.rest
        return ParseResult(record, rest)


class ChoiceParser:
    """
    ChoiceParser parses one of several possible sub-parsers. ("ordered choice")
    The object from the first sub-parser to successfully parse is returned.
    """

    def __init__(self, *choices):
        self.choices = choices

    def parse(self, s, context) -> ParseResult:
        for choice in self.choices:
            sub_result = choice.parse(s, context)
            if sub_result:
                return sub_result
        return None


class NameParser:
    """
    NameParser refers to a parser defined within a containing `GrammarParser`.
    """

    def __init__(self, name):
        self.name = name

    def parse(self, s, context) -> ParseResult:
        return context["names"][self.name].parse(s, context)


class StarParser:
    """
    StarParser parses zero-or-more occurences of some sub-parser.
    ("Kleene-star")
    The list of parsed objects is returned.
    """

    def __init__(self, p):
        self.p = p

    def parse(self, string: str, context) -> ParseResult:
        objects = []
        rest = string
        while True:
            sub_result = self.p.parse(rest, context)
            if not sub_result:
                return ParseResult(objects, rest)
            objects.append(sub_result.object)
            rest = sub_result.rest


class GrammarParser:
    """
    GrammarParser allows recursive grammars to be created.
    The `grammar` argument is a dict of parsers. Each key of the dict can be
    referenced in a `NameParser`. This allows recursive parsing rules.

    GrammarParser adds a `"names"` key to the `context` dict. This is where the
    `NameParser` will look for definitions.
    """

    def __init__(self, initial, grammar: dict):
        self.initial = initial
        self.grammar = grammar

    def parse(self, string: str, context: dict):
        context_with_names_map = {**context, "names": self.grammar}
        return self.grammar[self.initial].parse(string, context_with_names_map)

################################################################################


# Define token parsers
intParser = RegexParser("[0-9]+")

opParser = RegexParser(r"\s*([/*+-])\s*")

openParser = RegexParser(r"\s*\(\s*")
closeParser = RegexParser(r"\s*\)\s*")

eofParser = RegexParser(r"$")

# Define grammar
grammar = GrammarParser("program", {
    "atom": ChoiceParser(intParser, NameParser("parened")),
    "parened": SeqParser({
        "_open": openParser,
        "e": NameParser("exp"),
        "_close": closeParser,
    }),
    "exp": SeqParser({
        "lhs": NameParser("atom"),
        "ops": StarParser(NameParser("operation")),
    }),
    "operation": SeqParser({"op": opParser, "rhs": NameParser("atom")}),
    "program": SeqParser({"e": NameParser("exp"),  "_": eofParser}),
})

# Parse an input
text = "   (1 + 2) * (3 + 4)   "
ast = grammar.parse(text, {})

print(ast)
	import re


	class ParseResult:
	"""
	Represents the result of parsing an input.
	`object` is the thing that was parsed from the beginning of the input.
	`rest` is everything that follows the `object` within the input.
	"""

	def __init__(self, object, rest):
	self.object = object
	self.rest = rest

	def __str__(self):
	return "ParseResult(object=" + str(self.object) + ", rest=" + repr(self.rest) + ")"


	class RegexParser:
	"""
	RegexParser uses a regex to parse from an input.
	The object that is parsed is the regex groups, followed by the entire match,
	as a tuple.
	"""

	def __init__(self, regex):
	self.regex = regex

	def parse(self, string: str, context) -> ParseResult:
	match = re.match(self.regex, string)
	if not match:
	return None
	return ParseResult(match.groups() + (match.group(),), string[match.end():])


	class SeqParser:
	"""
	SeqParser parses a sequence of several sub-parsers, stored in a dict.
	The parsed object is a dict, where each sub-parser's result was placed into
	the corresponding key in the output dict.
	"""

	def __init__(self, parsers: dict):
	self.parsers = parsers

	def parse(self, string: str, context) -> ParseResult:
	record = {}
	rest = string
	for key, parser in self.parsers.items():
	sub_result = parser.parse(rest, context)
	if not sub_result:
	return None
	record[key] = sub_result.object
	rest = sub_result.rest
	return ParseResult(record, rest)


	class ChoiceParser:
	"""
	ChoiceParser parses one of several possible sub-parsers. ("ordered choice")
	The object from the first sub-parser to successfully parse is returned.
	"""

	def __init__(self, *choices):
	self.choices = choices

	def parse(self, s, context) -> ParseResult:
	for choice in self.choices:
	sub_result = choice.parse(s, context)
	if sub_result:
	return sub_result
	return None


	class NameParser:
	"""
	NameParser refers to a parser defined within a containing `GrammarParser`.
	"""

	def __init__(self, name):
	self.name = name

	def parse(self, s, context) -> ParseResult:
	return context["names"][self.name].parse(s, context)


	class StarParser:
	"""
	StarParser parses zero-or-more occurences of some sub-parser.
	("Kleene-star")
	The list of parsed objects is returned.
	"""

	def __init__(self, p):
	self.p = p

	def parse(self, string: str, context) -> ParseResult:
	objects = []
	rest = string
	while True:
	sub_result = self.p.parse(rest, context)
	if not sub_result:
	return ParseResult(objects, rest)
	objects.append(sub_result.object)
	rest = sub_result.rest


	class GrammarParser:
	"""
	GrammarParser allows recursive grammars to be created.
	The `grammar` argument is a dict of parsers. Each key of the dict can be
	referenced in a `NameParser`. This allows recursive parsing rules.

	GrammarParser adds a `"names"` key to the `context` dict. This is where the
	`NameParser` will look for definitions.
	"""

	def __init__(self, initial, grammar: dict):
	self.initial = initial
	self.grammar = grammar

	def parse(self, string: str, context: dict):
	context_with_names_map = {**context, "names": self.grammar}
	return self.grammar[self.initial].parse(string, context_with_names_map)

	################################################################################


	# Define token parsers
	intParser = RegexParser("[0-9]+")

	opParser = RegexParser(r"\s([/+-])\s*")

	openParser = RegexParser(r"\s\(\s")
	closeParser = RegexParser(r"\s\)\s")

	eofParser = RegexParser(r"$")

	# Define grammar
	grammar = GrammarParser("program", {
	"atom": ChoiceParser(intParser, NameParser("parened")),
	"parened": SeqParser({
	"_open": openParser,
	"e": NameParser("exp"),
	"_close": closeParser,
	}),
	"exp": SeqParser({
	"lhs": NameParser("atom"),
	"ops": StarParser(NameParser("operation")),
	}),
	"operation": SeqParser({"op": opParser, "rhs": NameParser("atom")}),
	"program": SeqParser({"e": NameParser("exp"), "_": eofParser}),
	})

	# Parse an input
	text = " (1 + 2) * (3 + 4) "
	ast = grammar.parse(text, {})

	print(ast)