Skip to content

Instantly share code, notes, and snippets.

@andrewcb
Created December 25, 2023 01:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewcb/e529ec235af29c39b48a1de8699dc4f1 to your computer and use it in GitHub Desktop.
Save andrewcb/e529ec235af29c39b48a1de8699dc4f1 to your computer and use it in GitHub Desktop.
# A lightweight approach to parsing text with combinators.
# put together in an afternoon for a project by acb, dedicated to the public domain
# Inspired by Haskell Parsec and the PointFree swift-parsing library
import re
# elementary parsing functions.
# These take an input string and return `(parsed_value, remainder)` if
# successful, or `None` if not.
# The functions below generate a parsing function when called.
def parse_literal(str):
"""Make a parser that accepts a literal string, consuming it and
returning True"""
def parse(inp):
if inp.startswith(str):
return (True, inp[len(str):])
return parse
def parse_re(r, transform=None):
"""Make a parser that accepts a match to a regular expression.
The regular expression may be supplied as a string or precompiled.
The successful result by default is the text matching.
If the transform parameter is provided, it should contain a function
which accepts the re.Match object returned and returns the desired value.
"""
if type(r) == str:
r = re.compile(r)
transform = transform or (lambda m:m.group(0))
def parse(inp):
if m:=r.match(inp):
return (transform(m), inp[len(m.group(0)):])
return parse
# A few simple use cases:
# Parse an integer value.
parse_int = parse_re(re.compile("[0-9]+"), lambda m: int(m.group(0)))
# Parse a word (i.e., an identifier or similar)
parse_word = parse_re(re.compile("\w+"), lambda m: m.group(0))
# combinators:
def parse_chain(*parsers, transform=None):
"""
Make a parser that calls a list of parsers in succession on the same text,
and if all succeed, return a list of result values (and the remainder).
If transform is given, it is a function that converts the resulting list to
an arbitrary result value.
"""
def parse(inp):
result = []
for parser in parsers:
if (parsed:=parser(inp)) is None:
return None
(r, inp) = parsed
result.append(r)
if transform:
result = transform(*result)
return (result, inp)
return parse
def parse_one_of(*parsers):
"""
Make a parser that attempts to apply one of a succession of parsers to
a string, returning the result of the first one to succeed, or None.
"""
def parse(inp):
for parser in parsers:
if parsed:=parser(inp):
return parsed
return None
return parse
def parse_opt(parser):
"""
Make a parser which will attempt to run a given subparser and return
its value if successful, or None (as a successful parse value) if not.
"""
def parse(inp):
return parser(inp) or (None, inp)
return parse
def parse_list(parser, separated_by=None):
"""
Make a parser that applies a parser repeatedly to its input, and returns
a list of the values parsed. A second subparser may be provided to handle
separators; its result will be discarded.
"""
def parse(inp):
result = []
while (r:=parser(inp)):
result.append(r[0])
inp = r[1]
if separated_by:
if (r:=separated_by(inp)):
inp = r[1]
else:
return (result, inp)
return (result, inp)
return parse
# examples
parse_space = parse_re("\s+")
# a simple quoted string without the possibility of escaped quotes
parse_quotestr = parse_re('"[^"]*"', transform=lambda m:m.group(0)[1:-1])
# a value that's either an integer or a quoted string
parse_val = parse_one_of(parse_int, parse_quotestr)
# a key=value assignment
parse_keyval = parse_chain(parse_word, parse_literal("="), parse_val, transform=lambda a,b,c:(a,c))
# parsing a simple HTML (opening) tag; i.e., `<foo bar=aaa baz>
# parse an attribute in the tag. An attribute may be of the form `key` or `key=val`. In the former case,
# the value is returned as `True`
parse_html_attr = parse_one_of(
parse_keyval,
parse_chain(parse_word, transform=lambda k:(k,True))
)
parse_html_tag = parse_chain(
parse_literal("<"),
parse_word,
parse_opt(
parse_chain(
parse_space,
parse_list(parse_html_attr, separated_by=parse_space),
transform=lambda a,b:b
)
),
parse_literal(">"),
transform=lambda a,b,c,d:(b,c)
)
# >>> parse_html_tag('<foo bar="a" count=23 flag>The rest of the input')
# (('foo', [('bar', 'a'), ('count', 23), ('flag', True)]), 'The rest of the input')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment