Created
June 16, 2023 19:52
-
-
Save mara004/d440e9dc084939bc28a64e1c426f4c36 to your computer and use it in GitHub Desktop.
Parser for a page text mini-language (WIP)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0
# Parser for a page text mini-language
# Technically, this might be a use case for some parser generator, but for now
# it's implemented "manually" with common string operations
__all__ = ["parse_pagetext"] | |
# asteval is an optional dependency: it is only needed when a page text
# string contains a math group. Without it, `aeval` is replaced by a stub
# that fails at call time rather than at import time.
try:
    import asteval
    aeval = asteval.Interpreter(minimal=True)
except ImportError:
    def aeval(*args, **kwargs):
        """Stand-in used when asteval is not installed; always raises."""
        raise RuntimeError("Math evaluation requires asteval.")
# Symbols of the page text mini-language.

# Item separator, range operator and range step operator.
SEP, RANGE, STEP = ",", "-", "@"

# Bracket pair delimiting a group.
OPEN, CLOSE = "(", ")"

# Repetition and exclusion operators.
MULTIPLEX, EXCLUDE = "*", "/"

# One-letter markers: count from the end, math expression, reversed order.
BACKWARD, MATH, REVERSE = "b", "m", "r"

# Keywords selecting sets of pages.
ODD, EVEN, ALL = "odd", "even", "all"

# Keywords standing for the first and the last page.
BEGIN, END = "begin", "end"
def _split(pt_string):
    """Split a page text string at separators that are outside brackets.

    Separators nested inside an OPEN/CLOSE pair are left untouched, so a
    bracketed group always ends up in a single item. Empty items (e.g. from
    doubled or trailing separators) are dropped.

    Args:
        pt_string: The page text string to split.

    Returns:
        List of non-empty item strings, in their original order.

    Raises:
        ValueError: If the brackets in `pt_string` are not balanced.
    """
    parts = []
    level = 0   # current bracket nesting depth
    cursor = 0  # start index of the item being collected
    for i, c in enumerate(pt_string):
        if c == OPEN:
            level += 1
        elif c == CLOSE:
            level -= 1
            # A closing bracket without a matching opener can never be
            # repaired by later characters, so fail right away.
            if level < 0:
                raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
        elif level == 0 and c == SEP:
            parts.append(pt_string[cursor:i])
            cursor = i + 1
    # Explicit exception instead of `assert`: this validates user input and
    # must not disappear when Python runs with -O.
    if level != 0:
        raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
    parts.append(pt_string[cursor:])
    return [p for p in parts if p]
def _apply_excludes(base, excludes): | |
not_found = [] | |
for v in excludes: | |
if v in base: | |
# in case of multiple occurrences, this removes the leftmost item | |
base.remove(v) | |
else: | |
not_found.append(v) | |
if len(not_found) > 0: | |
raise ValueError(f"Excessive excludes: {not_found}.") | |
def _get_group(t, sym):
    """Return the content of a bracket group marked with `sym`, else None.

    Two spellings are recognised: the marker in front of the group
    (sym + OPEN ... CLOSE) or behind it (OPEN ... CLOSE + sym). The slice
    offsets assume that `sym` and both brackets are one character each.

    Args:
        t: The token to inspect.
        sym: The marker character.

    Returns:
        The text between the brackets (possibly empty), or None if `t` has
        neither form.
    """
    marker_in_front = t.startswith(sym + OPEN) and t.endswith(CLOSE)
    if marker_in_front:
        return t[2:-1]
    marker_behind = t.startswith(OPEN) and t.endswith(CLOSE + sym)
    if marker_behind:
        return t[1:-2]
    return None
def _get_expr(t, sym): | |
if t.startswith(sym): | |
return t[1:] | |
elif t.endswith(sym): | |
return t[:-1] | |
return None | |
def _parse(pt_string, doclen):
    """Translate a normalized page text string into a list of page numbers.

    The string is split with `_split()`, and every item is matched against
    the following forms in this order (first match wins):

    1. a single page number (see `_to_num()` below)
    2. the keywords ODD, EVEN and ALL
    3. a BACKWARD group: the group is parsed recursively and every page `p`
       is replaced with `doclen+1 - p`
    4. a REVERSE group: the group is parsed recursively and reversed
    5. `base` EXCLUDE OPEN `excludes` CLOSE: both sides are parsed
       recursively and the excludes are removed from the base
    6. `base` MULTIPLEX `n`: the parsed base is repeated `n` times
    7. `start` RANGE `stop` [STEP `step`]: an inclusive range; an empty stop
       means `doclen`, and the range runs downwards if start >= stop

    Args:
        pt_string: Page text string, expected to be lowercase and free of
            spaces, with BEGIN/END already substituted.
        doclen: Number of pages in the document.

    Returns:
        List of page numbers, counted from 1. Values are not checked against
        `doclen`.

    Raises:
        ValueError: On an unknown token, an uninterpretable page number, a
            non-positive range step, or excludes that are not in their base.
    """

    def _subparse(pt_item):
        return _parse(pt_item, doclen)

    def _to_num(t, req=True):
        # Convert a token to a single page number. With req=False, None is
        # returned instead of raising if the token is not a number.
        #
        # isdecimal() rather than isnumeric(): the latter also accepts
        # characters like "²" or "½", which int() cannot convert.
        if t.isdecimal():
            return int(t)
        elif (p := _get_expr(t, BACKWARD)) and p.isdecimal():
            return doclen + 1 - int(p)
        elif (g := _get_group(t, MATH)):
            # NOTE(review): `g` comes straight from the caller's string and is
            # handed to the evaluator unchanged - confirm that this is
            # acceptable if the input may be untrusted.
            return int(aeval(g))
        elif req:
            raise ValueError(f"Could not interpret {t!r} as page number.")
        else:
            return None

    page_nums = []

    for t in _split(pt_string):

        if (p := _to_num(t, req=False)) is not None:
            page_nums.append(p)
        elif t == ODD:
            page_nums.extend(range(1, doclen + 1, 2))
        elif t == EVEN:
            page_nums.extend(range(2, doclen + 1, 2))
        elif t == ALL:
            page_nums.extend(range(1, doclen + 1))
        elif (g := _get_group(t, BACKWARD)):
            page_nums.extend([doclen + 1 - p for p in _subparse(g)])
        elif (g := _get_group(t, REVERSE)):
            page_nums.extend(reversed(_subparse(g)))
        # TODO operate only outside brackets
        elif EXCLUDE + OPEN in t and t.endswith(CLOSE):
            # t[:-1] drops the closing bracket of the exclude group; the
            # split then yields exactly the base and the excludes.
            base, excludes = [
                _subparse(ps)
                for ps in t[:-1].split(EXCLUDE + OPEN, maxsplit=1)
            ]
            _apply_excludes(base, excludes)
            page_nums.extend(base)
        elif MULTIPLEX in t:
            # rsplit: the multiplexer is whatever follows the last MULTIPLEX
            base, multiplexer = t.rsplit(MULTIPLEX, maxsplit=1)
            page_nums.extend(_subparse(base) * _to_num(multiplexer))
        elif RANGE in t:
            step = 1
            start, stop = t.split(RANGE, maxsplit=1)
            if STEP in stop:
                stop, step = stop.split(STEP, maxsplit=1)
                step = _to_num(step)
                # The direction is derived from start/stop below, so the
                # step itself must be positive. Otherwise range() would
                # fail (0) or silently yield nothing (negative).
                if step < 1:
                    raise ValueError(
                        f"Range step must be a positive integer, got {step}."
                    )
            start = _to_num(start)
            stop = doclen if stop == "" else _to_num(stop)
            if start < stop:
                page_nums.extend(range(start, stop + 1, step))
            else:
                page_nums.extend(range(start, stop - 1, -step))
        # TODO handle unnecessary brackets
        else:
            raise ValueError(f"Unexpected token: {t!r}.")

    return page_nums
def parse_pagetext(pt_string, doclen, as_indices=True):
    """Parse a page text string into a list of pages.

    Before parsing, spaces are removed, the string is lowercased, and the
    keywords BEGIN and END are replaced with "1" and `doclen`, respectively.

    Args:
        pt_string: The page text string.
        doclen: Number of pages in the document.
        as_indices: If True, subtract 1 from every parsed page number;
            if False, return the page numbers as parsed.

    Returns:
        List of integers, in the order given by `pt_string`.
    """
    # TODO add check step
    normalized = pt_string.replace(" ", "").lower()
    normalized = normalized.replace(BEGIN, "1")
    normalized = normalized.replace(END, str(doclen))

    page_numbers = _parse(normalized, doclen)
    if not as_indices:
        return page_numbers
    return [number - 1 for number in page_numbers]
Might want to re-implement this to allow for more flexible grouping.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Random examples showcasing some of the functionality: