Skip to content

Instantly share code, notes, and snippets.

@mara004
Created June 16, 2023 19:52
Show Gist options
  • Save mara004/d440e9dc084939bc28a64e1c426f4c36 to your computer and use it in GitHub Desktop.
Save mara004/d440e9dc084939bc28a64e1c426f4c36 to your computer and use it in GitHub Desktop.
Parser for a page text mini-language (WIP)
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0
# Parser for a page text mini-language
# Technically, this might be a use case for some parser generator, but for now
# it's implemented "manually" with common string operations
# Public API of this module: only the top-level entry point is exported.
__all__ = ["parse_pagetext"]
# asteval is an optional dependency, used only to evaluate math expressions.
try:
    import asteval
except ImportError:
    def aeval(*args, **kwargs):
        """Stand-in for the evaluator when asteval is missing; always raises."""
        raise RuntimeError("Math evaluation requires asteval.")
else:
    aeval = asteval.Interpreter(minimal=True)
# Tokens of the page text mini-language.
# Item separator; _split() only honours it outside of brackets.
SEP = ","
# Range operator between a start and a stop page ("start-stop").
RANGE = "-"
# Introduces the step of a range; it is looked up in the part after RANGE.
STEP = "@"
# Prefix/suffix marking a number, or a bracket group, as counted from the end.
BACKWARD = "b"
# Repeats the page list on its left by the number on its right.
MULTIPLEX = "*"
# Bracket pair delimiting groups.
OPEN = "("
CLOSE = ")"
# Followed by OPEN, introduces a group of pages to remove from the base list.
EXCLUDE = "/"
# Prefix/suffix of a bracket group whose content is evaluated with aeval().
MATH = "m"
# Prefix/suffix of a bracket group whose parsed result is reversed.
REVERSE = "r"
# Keywords expanding to the odd, even, or all page numbers of the document.
ODD = "odd"
EVEN = "even"
ALL = "all"
# Keywords that parse_pagetext() replaces with "1" and str(doclen) respectively.
BEGIN = "begin"
END = "end"
def _split(pt_string):
    """Split *pt_string* at every SEP that is not enclosed in brackets.

    Nesting is tracked by counting OPEN and CLOSE characters; only a SEP
    found at nesting level 0 ends an item.

    Returns:
        The items in their original order, with empty items dropped.

    Raises:
        ValueError: If the OPEN/CLOSE brackets are unbalanced.
    """
    parts = []
    level = 0
    cursor = 0
    for i, c in enumerate(pt_string):
        if c == OPEN:
            level += 1
        elif c == CLOSE:
            level -= 1
            # A CLOSE without a preceding OPEN can never be balanced later on.
            if level < 0:
                raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
        elif level == 0 and c == SEP:
            parts.append(pt_string[cursor:i])
            cursor = i + 1
    # Raise rather than assert: this validates input, and asserts vanish under -O.
    if level != 0:
        raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
    parts.append(pt_string[cursor:])
    return [p for p in parts if p]
def _apply_excludes(base, excludes):
not_found = []
for v in excludes:
if v in base:
# in case of multiple occurrences, this removes the leftmost item
base.remove(v)
else:
not_found.append(v)
if len(not_found) > 0:
raise ValueError(f"Excessive excludes: {not_found}.")
def _get_group(t, sym):
    """Return the content of a bracket group tagged with *sym*, else None.

    Two spellings are recognised: the symbol before the group
    (sym + OPEN ... CLOSE) or after it (OPEN ... CLOSE + sym).

    Returns:
        The text between the brackets (possibly an empty string), or None
        if *t* has neither form.
    """
    prefix = sym + OPEN
    suffix = CLOSE + sym
    # Offsets come from the token lengths, so symbols longer than one
    # character are sliced correctly as well.
    if t.startswith(prefix) and t.endswith(CLOSE):
        return t[len(prefix):len(t) - len(CLOSE)]
    if t.startswith(OPEN) and t.endswith(suffix):
        return t[len(OPEN):len(t) - len(suffix)]
    return None
def _get_expr(t, sym):
if t.startswith(sym):
return t[1:]
elif t.endswith(sym):
return t[:-1]
return None
def _parse(pt_string, doclen):
    """Parse a normalised page text string into a list of page numbers.

    The string is split into items with _split(); each item is matched
    against the supported forms in a fixed order (single number, keyword,
    backward group, reverse group, exclusion, multiplexing, range) and the
    resulting page numbers are appended in order. Groups are parsed
    recursively.

    Args:
        pt_string: Page text; parse_pagetext() passes it with spaces removed,
            lowercased, and BEGIN/END already substituted.
        doclen: Number of pages in the document.

    Returns:
        A list of page numbers; duplicates and arbitrary order are possible.

    Raises:
        ValueError: If an item cannot be interpreted.
    """

    def _subparse(pt_item):
        return _parse(pt_item, doclen)

    def _to_num(t, req=True):
        # Resolve a single page number. With req=False, return None instead of
        # raising if t is not a number, so the caller can try other forms.
        # isdecimal() rather than isnumeric(): the latter also accepts
        # characters such as fractions or superscripts that int() rejects.
        if t.isdecimal():
            return int(t)
        elif (p := _get_expr(t, BACKWARD)) and p.isdecimal():
            # count from the end: "b1" / "1b" is the last page
            return doclen + 1 - int(p)
        elif (g := _get_group(t, MATH)):
            # NOTE(review): presumably aeval returns a number here; what it
            # returns for an invalid expression is not visible - confirm.
            return int(aeval(g))
        elif req:
            raise ValueError(f"Could not interpret {t!r} as page number.")
        else:
            return None

    page_nums = []
    for t in _split(pt_string):
        if (p := _to_num(t, req=False)) is not None:
            page_nums.append(p)
        elif t == ODD:
            page_nums.extend(range(1, doclen + 1, 2))
        elif t == EVEN:
            page_nums.extend(range(2, doclen + 1, 2))
        elif t == ALL:
            page_nums.extend(range(1, doclen + 1))
        elif (g := _get_group(t, BACKWARD)):
            page_nums.extend([doclen + 1 - p for p in _subparse(g)])
        elif (g := _get_group(t, REVERSE)):
            page_nums.extend(reversed(_subparse(g)))
        # TODO operate only outside brackets
        elif EXCLUDE + OPEN in t and t.endswith(CLOSE):
            # drop the trailing CLOSE, then split into base and exclude group
            base, excludes = [
                _subparse(ps)
                for ps in t[:-1].split(EXCLUDE + OPEN, maxsplit=1)
            ]
            _apply_excludes(base, excludes)
            page_nums.extend(base)
        elif MULTIPLEX in t:
            base, multiplexer = t.rsplit(MULTIPLEX, maxsplit=1)
            page_nums.extend(_subparse(base) * _to_num(multiplexer))
        elif RANGE in t:
            step = 1
            start, stop = t.split(RANGE, maxsplit=1)
            if STEP in stop:
                stop, step = stop.split(STEP, maxsplit=1)
                step = _to_num(step)
                # A math group could yield a zero or negative step, which
                # would fail or silently produce a wrong range.
                if step < 1:
                    raise ValueError(
                        f"Step must be a positive integer, got {step}."
                    )
            start = _to_num(start)
            # an open-ended range runs up to the last page
            stop = doclen if stop == "" else _to_num(stop)
            if start < stop:
                page_nums.extend(range(start, stop + 1, step))
            else:
                # descending range; the stop page is included as well
                page_nums.extend(range(start, stop - 1, -step))
        # TODO handle unnecessary brackets
        else:
            raise ValueError(f"Unexpected token: {t!r}.")
    return page_nums
def parse_pagetext(pt_string, doclen, as_indices=True):
    """Parse a page text string into a list of pages.

    Before parsing, spaces are removed, the text is lowercased, and the
    BEGIN and END keywords are replaced with "1" and the document length.

    Args:
        pt_string: The page text to parse.
        doclen: Number of pages in the document.
        as_indices: If true (the default), return zero-based indices, i.e.
            each parsed page number minus one. Otherwise return the page
            numbers as parsed.

    Returns:
        A list of integers in the order given by the page text.
    """
    # TODO add check step
    normalized = pt_string.replace(" ", "").lower()
    normalized = normalized.replace(BEGIN, "1")
    normalized = normalized.replace(END, str(doclen))
    page_numbers = _parse(normalized, doclen)
    if not as_indices:
        return page_numbers
    return [number - 1 for number in page_numbers]
@mara004
Copy link
Author

mara004 commented Jun 16, 2023

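Random examples showcasing some of the functionality (outputs are shown as 1-based page numbers, i.e. as returned with `as_indices=False`; the default `as_indices=True` yields each value minus one):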

>>> parse_pagetext("1, 3-5, 7-20/(10-13, 15, m(end-4)), 5-3, r(all/(6-end), 1)", 20)
[1, 3, 4, 5, 7, 8, 9, 14, 17, 18, 19, 20, 5, 4, 3, 1, 5, 4, 3, 2, 1]
>>> parse_pagetext("begin, end, odd, even, all", 5)
[1, 5, 1, 3, 5, 2, 4, 1, 2, 3, 4, 5]
>>> parse_pagetext("1-8@2, 8-1@2", 8)
[1, 3, 5, 7, 8, 6, 4, 2]
>>> parse_pagetext("1-2*3/(2)", 2)
[1, 1, 2, 1, 2]

@mara004
Copy link
Author

mara004 commented Jun 16, 2023

Might want to re-implement this to allow for more flexible grouping.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment