andrewcb/combinator_parsing.py

## combinator_parsing.py
# A lightweight approach to parsing text with combinators.
# put together in an afternoon for a project by acb, dedicated to the public domain
# Inspired by Haskell Parsec and the PointFree swift-parsing library

import re

# elementary parsing functions.
# These take an input string and return `(parsed_value, remainder)` if
# successful, or `None` if not.
# The functions below generate a parsing function when called.

def parse_literal(str):
    """Make a parser that accepts a literal string, consuming it and
    returning True"""
    def parse(inp):
        if inp.startswith(str):
            return (True, inp[len(str):])
    return parse


def parse_re(r, transform=None):
    """Make a parser that accepts a match to a regular expression.
    The regular expression may be supplied as a string or precompiled.
    The successful result by default is the text matching.
    If the transform parameter is provided, it should contain a function
    which accepts the re.Match object returned and returns the desired value.
    """
    if type(r) == str:
        r = re.compile(r)
    transform = transform or (lambda m:m.group(0))
    def parse(inp):
        if m:=r.match(inp):
            return (transform(m), inp[len(m.group(0)):])
    return parse

# A few simple use cases:

# Parse an integer value.
parse_int = parse_re(re.compile("[0-9]+"), lambda m: int(m.group(0)))
# Parse a word (i.e., an identifier or similar)
parse_word = parse_re(re.compile("\w+"), lambda m: m.group(0))


# combinators:

def parse_chain(*parsers, transform=None):
    """
    Make a parser that calls a list of parsers in succession on the same text,
    and if all succeed, return a list of result values (and the remainder).
    If transform is given, it is a function that converts the resulting list to
    an arbitrary result value.
    """
    def parse(inp):
        result = []
        for parser in parsers:
            if (parsed:=parser(inp)) is None:
                return None
            (r, inp) = parsed
            result.append(r)
        if transform:
            result = transform(*result)
        return (result, inp)
    return parse


def parse_one_of(*parsers):
    """
    Make a parser that attempts to apply one of a succession of parsers to
    a string, returning the result of the first one to succeed, or None.
    """
    def parse(inp):
        for parser in parsers:
            if parsed:=parser(inp):
                return parsed
        return None
    return parse

def parse_opt(parser):
    """
    Make a parser which will attempt to run a given subparser and return
    its value if successful, or None (as a successful parse value) if not.
    """
    def parse(inp):
        return parser(inp) or (None, inp)
    return parse


def parse_list(parser, separated_by=None):
    """
    Make a parser that applies a parser repeatedly to its input, and returns
    a list of the values parsed. A second subparser may be provided to handle
    separators; its result will be discarded.
    """
    def parse(inp):
        result = []
        while (r:=parser(inp)):
            result.append(r[0])
            inp = r[1]
            if separated_by:
                if (r:=separated_by(inp)):
                    inp = r[1]
                else:
                    return (result, inp)
        return (result, inp)
    return parse

# examples

parse_space = parse_re("\s+")

# a simple quoted string without the possibility of escaped quotes
parse_quotestr = parse_re('"[^"]*"', transform=lambda m:m.group(0)[1:-1])

# a value that's either an integer or a quoted string
parse_val = parse_one_of(parse_int, parse_quotestr)

# a key=value assignment
parse_keyval = parse_chain(parse_word, parse_literal("="), parse_val, transform=lambda a,b,c:(a,c))

# parsing a simple HTML (opening) tag; i.e., `<foo bar=aaa baz>

# parse an attribute in the tag. An attribute may be of the form `key` or `key=val`. In the former case,
# the value is returned as `True`
parse_html_attr = parse_one_of(
    parse_keyval,
    parse_chain(parse_word, transform=lambda k:(k,True))
)

parse_html_tag = parse_chain(
    parse_literal("<"),
    parse_word,
    parse_opt(
        parse_chain(
            parse_space,
            parse_list(parse_html_attr, separated_by=parse_space),
            transform=lambda a,b:b
        )
    ),
    parse_literal(">"),
    transform=lambda a,b,c,d:(b,c)
)

# >>> parse_html_tag('<foo bar="a" count=23 flag>The rest of the input')
# (('foo', [('bar', 'a'), ('count', 23), ('flag', True)]), 'The rest of the input')
	# A lightweight approach to parsing text with combinators.
	# put together in an afternoon for a project by acb, dedicated to the public domain
	# Inspired by Haskell Parsec and the PointFree swift-parsing library

	import re

	# elementary parsing functions.
	# These take an input string and return `(parsed_value, remainder)` if
	# successful, or `None` if not.
	# The functions below generate a parsing function when called.

	def parse_literal(str):
	"""Make a parser that accepts a literal string, consuming it and
	returning True"""
	def parse(inp):
	if inp.startswith(str):
	return (True, inp[len(str):])
	return parse


	def parse_re(r, transform=None):
	"""Make a parser that accepts a match to a regular expression.
	The regular expression may be supplied as a string or precompiled.
	The successful result by default is the text matching.
	If the transform parameter is provided, it should contain a function
	which accepts the re.Match object returned and returns the desired value.
	"""
	if type(r) == str:
	r = re.compile(r)
	transform = transform or (lambda m:m.group(0))
	def parse(inp):
	if m:=r.match(inp):
	return (transform(m), inp[len(m.group(0)):])
	return parse

	# A few simple use cases:

	# Parse an integer value.
	parse_int = parse_re(re.compile("[0-9]+"), lambda m: int(m.group(0)))
	# Parse a word (i.e., an identifier or similar)
	parse_word = parse_re(re.compile("\w+"), lambda m: m.group(0))



	# combinators:

	def parse_chain(*parsers, transform=None):
	"""
	Make a parser that calls a list of parsers in succession on the same text,
	and if all succeed, return a list of result values (and the remainder).
	If transform is given, it is a function that converts the resulting list to
	an arbitrary result value.
	"""
	def parse(inp):
	result = []
	for parser in parsers:
	if (parsed:=parser(inp)) is None:
	return None
	(r, inp) = parsed
	result.append(r)
	if transform:
	result = transform(*result)
	return (result, inp)
	return parse


	def parse_one_of(*parsers):
	"""
	Make a parser that attempts to apply one of a succession of parsers to
	a string, returning the result of the first one to succeed, or None.
	"""
	def parse(inp):
	for parser in parsers:
	if parsed:=parser(inp):
	return parsed
	return None
	return parse

	def parse_opt(parser):
	"""
	Make a parser which will attempt to run a given subparser and return
	its value if successful, or None (as a successful parse value) if not.
	"""
	def parse(inp):
	return parser(inp) or (None, inp)
	return parse


	def parse_list(parser, separated_by=None):
	"""
	Make a parser that applies a parser repeatedly to its input, and returns
	a list of the values parsed. A second subparser may be provided to handle
	separators; its result will be discarded.
	"""
	def parse(inp):
	result = []
	while (r:=parser(inp)):
	result.append(r[0])
	inp = r[1]
	if separated_by:
	if (r:=separated_by(inp)):
	inp = r[1]
	else:
	return (result, inp)
	return (result, inp)
	return parse

	# examples

	parse_space = parse_re("\s+")

	# a simple quoted string without the possibility of escaped quotes
	parse_quotestr = parse_re('"[^"]*"', transform=lambda m:m.group(0)[1:-1])

	# a value that's either an integer or a quoted string
	parse_val = parse_one_of(parse_int, parse_quotestr)

	# a key=value assignment
	parse_keyval = parse_chain(parse_word, parse_literal("="), parse_val, transform=lambda a,b,c:(a,c))

	# parsing a simple HTML (opening) tag; i.e., `<foo bar=aaa baz>

	# parse an attribute in the tag. An attribute may be of the form `key` or `key=val`. In the former case,
	# the value is returned as `True`
	parse_html_attr = parse_one_of(
	parse_keyval,
	parse_chain(parse_word, transform=lambda k:(k,True))
	)

	parse_html_tag = parse_chain(
	parse_literal("<"),
	parse_word,
	parse_opt(
	parse_chain(
	parse_space,
	parse_list(parse_html_attr, separated_by=parse_space),
	transform=lambda a,b:b
	)
	),
	parse_literal(">"),
	transform=lambda a,b,c,d:(b,c)
	)

	# >>> parse_html_tag('<foo bar="a" count=23 flag>The rest of the input')
	# (('foo', [('bar', 'a'), ('count', 23), ('flag', True)]), 'The rest of the input')