Created
December 25, 2023 01:11
-
-
Save andrewcb/e529ec235af29c39b48a1de8699dc4f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A lightweight approach to parsing text with combinators. | |
# put together in an afternoon for a project by acb, dedicated to the public domain | |
# Inspired by Haskell Parsec and the PointFree swift-parsing library | |
import re | |
# elementary parsing functions. | |
# These take an input string and return `(parsed_value, remainder)` if | |
# successful, or `None` if not. | |
# The functions below generate a parsing function when called. | |
def parse_literal(str): | |
"""Make a parser that accepts a literal string, consuming it and | |
returning True""" | |
def parse(inp): | |
if inp.startswith(str): | |
return (True, inp[len(str):]) | |
return parse | |
def parse_re(r, transform=None): | |
"""Make a parser that accepts a match to a regular expression. | |
The regular expression may be supplied as a string or precompiled. | |
The successful result by default is the text matching. | |
If the transform parameter is provided, it should contain a function | |
which accepts the re.Match object returned and returns the desired value. | |
""" | |
if type(r) == str: | |
r = re.compile(r) | |
transform = transform or (lambda m:m.group(0)) | |
def parse(inp): | |
if m:=r.match(inp): | |
return (transform(m), inp[len(m.group(0)):]) | |
return parse | |
# A few simple use cases: | |
# Parse an integer value. | |
parse_int = parse_re(re.compile("[0-9]+"), lambda m: int(m.group(0))) | |
# Parse a word (i.e., an identifier or similar) | |
parse_word = parse_re(re.compile("\w+"), lambda m: m.group(0)) | |
# combinators: | |
def parse_chain(*parsers, transform=None): | |
""" | |
Make a parser that calls a list of parsers in succession on the same text, | |
and if all succeed, return a list of result values (and the remainder). | |
If transform is given, it is a function that converts the resulting list to | |
an arbitrary result value. | |
""" | |
def parse(inp): | |
result = [] | |
for parser in parsers: | |
if (parsed:=parser(inp)) is None: | |
return None | |
(r, inp) = parsed | |
result.append(r) | |
if transform: | |
result = transform(*result) | |
return (result, inp) | |
return parse | |
def parse_one_of(*parsers): | |
""" | |
Make a parser that attempts to apply one of a succession of parsers to | |
a string, returning the result of the first one to succeed, or None. | |
""" | |
def parse(inp): | |
for parser in parsers: | |
if parsed:=parser(inp): | |
return parsed | |
return None | |
return parse | |
def parse_opt(parser): | |
""" | |
Make a parser which will attempt to run a given subparser and return | |
its value if successful, or None (as a successful parse value) if not. | |
""" | |
def parse(inp): | |
return parser(inp) or (None, inp) | |
return parse | |
def parse_list(parser, separated_by=None): | |
""" | |
Make a parser that applies a parser repeatedly to its input, and returns | |
a list of the values parsed. A second subparser may be provided to handle | |
separators; its result will be discarded. | |
""" | |
def parse(inp): | |
result = [] | |
while (r:=parser(inp)): | |
result.append(r[0]) | |
inp = r[1] | |
if separated_by: | |
if (r:=separated_by(inp)): | |
inp = r[1] | |
else: | |
return (result, inp) | |
return (result, inp) | |
return parse | |
# examples | |
parse_space = parse_re("\s+") | |
# a simple quoted string without the possibility of escaped quotes | |
parse_quotestr = parse_re('"[^"]*"', transform=lambda m:m.group(0)[1:-1]) | |
# a value that's either an integer or a quoted string | |
parse_val = parse_one_of(parse_int, parse_quotestr) | |
# a key=value assignment | |
parse_keyval = parse_chain(parse_word, parse_literal("="), parse_val, transform=lambda a,b,c:(a,c)) | |
# parsing a simple HTML (opening) tag; i.e., `<foo bar=aaa baz> | |
# parse an attribute in the tag. An attribute may be of the form `key` or `key=val`. In the former case, | |
# the value is returned as `True` | |
parse_html_attr = parse_one_of( | |
parse_keyval, | |
parse_chain(parse_word, transform=lambda k:(k,True)) | |
) | |
parse_html_tag = parse_chain( | |
parse_literal("<"), | |
parse_word, | |
parse_opt( | |
parse_chain( | |
parse_space, | |
parse_list(parse_html_attr, separated_by=parse_space), | |
transform=lambda a,b:b | |
) | |
), | |
parse_literal(">"), | |
transform=lambda a,b,c,d:(b,c) | |
) | |
# >>> parse_html_tag('<foo bar="a" count=23 flag>The rest of the input') | |
# (('foo', [('bar', 'a'), ('count', 23), ('flag', True)]), 'The rest of the input') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment