# mara004/ptp.py

## ptp.py
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0

# Parser for a page text mini-language
# Technically, this might be a use case for some parser generator, but for now
# it's implemented "manually" with common string operations

# Only the entry point is exported; everything else is an internal helper
__all__ = ["parse_pagetext"]

# asteval is an optional dependency. `aeval` is called with the text of a
# math group and its result is converted to int by the parser.
try:
    import asteval  # optional, used to parse math expressions
    aeval = asteval.Interpreter(minimal=True)
except ImportError:
    # Fallback stub: importing this module still works without asteval,
    # and only the attempt to evaluate a math group fails.
    def aeval(*args, **kwargs):
        raise RuntimeError("Math evaluation requires asteval.")

# Symbols of the page text mini-language.
# Input is lowercased and stripped of spaces before these are matched.
SEP       = ","      # separates items (only honored outside brackets)
RANGE     = "-"      # start-stop range; an empty stop means the last page
STEP      = "@"      # optional step after the stop of a range: start-stop@step
BACKWARD  = "b"      # number or group counted from the end of the document
MULTIPLEX = "*"      # repeats the parsed base: base*count
OPEN      = "("      # opens a group
CLOSE     = ")"      # closes a group
EXCLUDE   = "/"      # base/(excludes): removes the excluded pages from base
MATH      = "m"      # group whose content is evaluated as a math expression
REVERSE   = "r"      # group whose resulting page list is reversed
ODD       = "odd"    # all odd page numbers
EVEN      = "even"   # all even page numbers
ALL       = "all"    # all page numbers
BEGIN     = "begin"  # alias replaced by "1" before parsing
END       = "end"    # alias replaced by the document length before parsing


def _split(pt_string):
    """Split *pt_string* at separators that are not enclosed in brackets.

    Empty items (resulting from leading, trailing or doubled separators) are
    dropped from the result.

    Args:
        pt_string: The page text (or a sub-expression of it) to split.

    Returns:
        A list of the non-empty top-level items, in order of appearance.

    Raises:
        ValueError: If the brackets in *pt_string* are unbalanced, i.e. a
            closing bracket has no matching opening bracket or vice versa.
    """

    splitted = []
    level = 0   # current bracket nesting depth
    cursor = 0  # start index of the item currently being scanned

    for i, c in enumerate(pt_string):
        if c == OPEN:
            level += 1
        elif c == CLOSE:
            level -= 1
            # A negative depth means a closing bracket came before its opener.
            # Checking only the final depth would let input like ")(" pass.
            if level < 0:
                raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
        elif level == 0 and c == SEP:
            splitted.append( pt_string[cursor:i] )
            cursor = i + 1

    # Explicit exception rather than assert: this validates user input, and
    # assert statements are removed when Python runs with -O.
    if level != 0:
        raise ValueError(f"Unbalanced brackets in {pt_string!r}.")

    splitted.append( pt_string[cursor:] )

    return [p for p in splitted if p]


def _apply_excludes(base, excludes):

    not_found = []
    for v in excludes:
        if v in base:
            # in case of multiple occurrences, this removes the leftmost item
            base.remove(v)
        else:
            not_found.append(v)

    if len(not_found) > 0:
        raise ValueError(f"Excessive excludes: {not_found}.")


def _get_group(t, sym):
    """Return the content of a bracket group marked with *sym*, else None.

    Two spellings are recognized: the symbol in front of the group
    (``sym(...)``) or behind it (``(...)sym``). The returned string is the
    text between the outer brackets. The slice offsets assume that *sym* is
    a single character.
    """
    has_prefix_form = t.startswith(sym + OPEN) and t.endswith(CLOSE)
    if has_prefix_form:
        # strip "s(" in front and ")" behind
        return t[2:-1]

    has_suffix_form = t.startswith(OPEN) and t.endswith(CLOSE + sym)
    # strip "(" in front and ")s" behind
    return t[1:-2] if has_suffix_form else None


def _get_expr(t, sym):
    if t.startswith(sym):
        return t[1:]
    elif t.endswith(sym):
        return t[:-1]
    return None


def _parse(pt_string, doclen):
    """Expand a page text string into a list of 1-based page numbers.

    The string is split at top-level separators, and each item is interpreted
    as the first matching form of the following (checked in this order):

    * a single page number: plain (``5``), backward (``b5`` / ``5b``, where
      1 refers to the last page), or a math group (``m(...)`` / ``(...)m``)
    * one of the keywords ``odd``, ``even``, ``all``
    * a backward group ``b(...)`` / ``(...)b``: every page of the parsed
      group is counted from the end of the document
    * a reverse group ``r(...)`` / ``(...)r``: the parsed group, reversed
    * an exclusion ``base/(excludes)``
    * a repetition ``base*count``
    * a range ``start-stop`` or ``start-stop@step``, inclusive on both ends;
      an empty stop means the last page, and a start that is not smaller
      than the stop yields a descending range

    Args:
        pt_string: Page text without spaces, in lowercase.
        doclen: Number of pages in the document.

    Returns:
        List of page numbers in the order given by *pt_string*. Duplicates
        are retained.

    Raises:
        ValueError: If an item cannot be interpreted, or if an exclusion
            names pages that are not in its base.
        RuntimeError: If a math group is used but asteval is not installed.
    """

    def _subparse(pt_item):
        # Nested expressions refer to the same document
        return _parse(pt_item, doclen)

    def _to_num(t, req=True):
        # Interpret `t` as a single page number. If it is not one, raise
        # ValueError when `req` is true, otherwise return None.
        #
        # isdecimal() instead of isnumeric(): the latter is also true for
        # characters like "²" or "½" that int() cannot convert, which would
        # surface as an int() error rather than a parser error.
        if t.isdecimal():
            return int(t)
        elif (p := _get_expr(t, BACKWARD)) and p.isdecimal():
            # count from the end: b1 is the last page
            return doclen+1 - int(p)
        elif (g := _get_group(t, MATH)):
            return int( aeval(g) )
        elif req:
            raise ValueError(f"Could not interpret {t!r} as page number.")
        else:
            return None

    page_nums = []

    for t in _split(pt_string):

        if (p := _to_num(t, req=False)) is not None:
            page_nums.append(p)
        elif t == ODD:
            page_nums.extend( range(1, doclen+1, 2) )
        elif t == EVEN:
            page_nums.extend( range(2, doclen+1, 2) )
        elif t == ALL:
            page_nums.extend( range(1, doclen+1) )
        elif (g := _get_group(t, BACKWARD)):
            page_nums.extend( [doclen+1 - p for p in _subparse(g)] )
        elif (g := _get_group(t, REVERSE)):
            page_nums.extend(reversed( _subparse(g) ))

        # TODO operate only outside brackets
        elif EXCLUDE+OPEN in t and t.endswith(CLOSE):
            # Drop the final closing bracket, then split at the first "/("
            # into the base expression and the excludes expression
            base, excludes = [_subparse(ps) for ps in t[:-1].split(EXCLUDE+OPEN, maxsplit=1)]
            _apply_excludes(base, excludes)
            page_nums.extend(base)
        elif MULTIPLEX in t:
            # The count is whatever follows the last "*"
            base, multiplexer = t.rsplit(MULTIPLEX, maxsplit=1)
            page_nums.extend( _subparse(base) * _to_num(multiplexer) )

        elif RANGE in t:

            step = 1
            start, stop = t.split(RANGE, maxsplit=1)
            if STEP in stop:
                stop, step = stop.split(STEP, maxsplit=1)
                step = _to_num(step)

            start = _to_num(start)
            stop = doclen if stop == "" else _to_num(stop)

            # Both ends are inclusive, hence the +1 / -1 on the stop value
            if start < stop:
                page_nums.extend( range(start, stop+1, step) )
            else:
                page_nums.extend( range(start, stop-1, -step) )

        # TODO handle unnecessary brackets

        else:
            raise ValueError(f"Unexpected token: {t!r}.")

    return page_nums


def parse_pagetext(pt_string, doclen, as_indices=True):
    """Parse a page text string into a list of pages.

    Before parsing, spaces are removed, the text is lowercased, and the
    aliases ``begin`` and ``end`` are replaced with ``1`` and *doclen*.

    Args:
        pt_string: The page text expression.
        doclen: Number of pages in the document.
        as_indices: If true, return 0-based indices, otherwise 1-based page
            numbers.

    Returns:
        List of integers in the order given by the expression.
    """
    # TODO add check step
    normalized = pt_string.replace(" ", "").lower()
    normalized = normalized.replace(BEGIN, "1")
    normalized = normalized.replace(END, str(doclen))

    page_nums = _parse(normalized, doclen)
    if not as_indices:
        return page_nums
    return [n-1 for n in page_nums]
	# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
	# SPDX-License-Identifier: CC-BY-4.0

	# Parser for a page text mini-language
	# Technically, this might be a use case for some parser generator, but for now
	# it's implemented "manually" with common string operations

	__all__ = ["parse_pagetext"]

	# asteval is an optional dependency. `aeval` is called with the text of a
	# math group and its result is converted to int by the parser.
	try:
		import asteval  # optional, used to parse math expressions
		aeval = asteval.Interpreter(minimal=True)
	except ImportError:
		# Fallback stub: importing this module still works without asteval,
		# and only the attempt to evaluate a math group fails.
		def aeval(*args, **kwargs):
			raise RuntimeError("Math evaluation requires asteval.")

	# Symbols of the page text mini-language.
	# Input is lowercased and stripped of spaces before these are matched.
	SEP = ","  # separates items (only honored outside brackets)
	RANGE = "-"  # start-stop range; an empty stop means the last page
	STEP = "@"  # optional step after the stop of a range: start-stop@step
	BACKWARD = "b"  # number or group counted from the end of the document
	MULTIPLEX = "*"  # repeats the parsed base: base*count
	OPEN = "("  # opens a group
	CLOSE = ")"  # closes a group
	EXCLUDE = "/"  # base/(excludes): removes the excluded pages from base
	MATH = "m"  # group whose content is evaluated as a math expression
	REVERSE = "r"  # group whose resulting page list is reversed
	ODD = "odd"  # all odd page numbers
	EVEN = "even"  # all even page numbers
	ALL = "all"  # all page numbers
	BEGIN = "begin"  # alias replaced by "1" before parsing
	END = "end"  # alias replaced by the document length before parsing


	def _split(pt_string):
		"""Split *pt_string* at separators that are not enclosed in brackets.

		Empty items (resulting from leading, trailing or doubled separators) are
		dropped from the result.

		Args:
			pt_string: The page text (or a sub-expression of it) to split.

		Returns:
			A list of the non-empty top-level items, in order of appearance.

		Raises:
			ValueError: If the brackets in *pt_string* are unbalanced, i.e. a
				closing bracket has no matching opening bracket or vice versa.
		"""

		splitted = []
		level = 0   # current bracket nesting depth
		cursor = 0  # start index of the item currently being scanned

		for i, c in enumerate(pt_string):
			if c == OPEN:
				level += 1
			elif c == CLOSE:
				level -= 1
				# A negative depth means a closing bracket came before its opener.
				# Checking only the final depth would let input like ")(" pass.
				if level < 0:
					raise ValueError(f"Unbalanced brackets in {pt_string!r}.")
			elif level == 0 and c == SEP:
				splitted.append( pt_string[cursor:i] )
				cursor = i + 1

		# Explicit exception rather than assert: this validates user input, and
		# assert statements are removed when Python runs with -O.
		if level != 0:
			raise ValueError(f"Unbalanced brackets in {pt_string!r}.")

		splitted.append( pt_string[cursor:] )

		return [p for p in splitted if p]


	def _apply_excludes(base, excludes):

	not_found = []
	for v in excludes:
	if v in base:
	# in case of multiple occurrences, this removes the leftmost item
	base.remove(v)
	else:
	not_found.append(v)

	if len(not_found) > 0:
	raise ValueError(f"Excessive excludes: {not_found}.")


	def _get_group(t, sym):
		"""Return the content of a bracket group marked with *sym*, else None.

		Two spellings are recognized: the symbol in front of the group
		(``sym(...)``) or behind it (``(...)sym``). The returned string is the
		text between the outer brackets. The slice offsets assume that *sym* is
		a single character.
		"""
		if t.startswith(sym+OPEN) and t.endswith(CLOSE):
			# strip "s(" in front and ")" behind
			return t[2:-1]
		elif t.startswith(OPEN) and t.endswith(CLOSE+sym):
			# strip "(" in front and ")s" behind
			return t[1:-2]
		return None


	def _get_expr(t, sym):
	if t.startswith(sym):
	return t[1:]
	elif t.endswith(sym):
	return t[:-1]
	return None


	def _parse(pt_string, doclen):
		"""Expand a page text string into a list of 1-based page numbers.

		The string is split at top-level separators, and each item is interpreted
		as the first matching form of the following (checked in this order):

		* a single page number: plain (``5``), backward (``b5`` / ``5b``, where
		  1 refers to the last page), or a math group (``m(...)`` / ``(...)m``)
		* one of the keywords ``odd``, ``even``, ``all``
		* a backward group ``b(...)`` / ``(...)b``: every page of the parsed
		  group is counted from the end of the document
		* a reverse group ``r(...)`` / ``(...)r``: the parsed group, reversed
		* an exclusion ``base/(excludes)``
		* a repetition ``base*count``
		* a range ``start-stop`` or ``start-stop@step``, inclusive on both ends;
		  an empty stop means the last page, and a start that is not smaller
		  than the stop yields a descending range

		Args:
			pt_string: Page text without spaces, in lowercase.
			doclen: Number of pages in the document.

		Returns:
			List of page numbers in the order given by *pt_string*. Duplicates
			are retained.

		Raises:
			ValueError: If an item cannot be interpreted, or if an exclusion
				names pages that are not in its base.
			RuntimeError: If a math group is used but asteval is not installed.
		"""

		def _subparse(pt_item):
			# Nested expressions refer to the same document
			return _parse(pt_item, doclen)

		def _to_num(t, req=True):
			# Interpret `t` as a single page number. If it is not one, raise
			# ValueError when `req` is true, otherwise return None.
			#
			# isdecimal() instead of isnumeric(): the latter is also true for
			# characters like "²" or "½" that int() cannot convert, which would
			# surface as an int() error rather than a parser error.
			if t.isdecimal():
				return int(t)
			elif (p := _get_expr(t, BACKWARD)) and p.isdecimal():
				# count from the end: b1 is the last page
				return doclen+1 - int(p)
			elif (g := _get_group(t, MATH)):
				return int( aeval(g) )
			elif req:
				raise ValueError(f"Could not interpret {t!r} as page number.")
			else:
				return None

		page_nums = []

		for t in _split(pt_string):

			if (p := _to_num(t, req=False)) is not None:
				page_nums.append(p)
			elif t == ODD:
				page_nums.extend( range(1, doclen+1, 2) )
			elif t == EVEN:
				page_nums.extend( range(2, doclen+1, 2) )
			elif t == ALL:
				page_nums.extend( range(1, doclen+1) )
			elif (g := _get_group(t, BACKWARD)):
				page_nums.extend( [doclen+1 - p for p in _subparse(g)] )
			elif (g := _get_group(t, REVERSE)):
				page_nums.extend(reversed( _subparse(g) ))

			# TODO operate only outside brackets
			elif EXCLUDE+OPEN in t and t.endswith(CLOSE):
				# Drop the final closing bracket, then split at the first "/("
				# into the base expression and the excludes expression
				base, excludes = [_subparse(ps) for ps in t[:-1].split(EXCLUDE+OPEN, maxsplit=1)]
				_apply_excludes(base, excludes)
				page_nums.extend(base)
			elif MULTIPLEX in t:
				# The count is whatever follows the last "*"
				base, multiplexer = t.rsplit(MULTIPLEX, maxsplit=1)
				page_nums.extend( _subparse(base) * _to_num(multiplexer) )

			elif RANGE in t:

				step = 1
				start, stop = t.split(RANGE, maxsplit=1)
				if STEP in stop:
					stop, step = stop.split(STEP, maxsplit=1)
					step = _to_num(step)

				start = _to_num(start)
				stop = doclen if stop == "" else _to_num(stop)

				# Both ends are inclusive, hence the +1 / -1 on the stop value
				if start < stop:
					page_nums.extend( range(start, stop+1, step) )
				else:
					page_nums.extend( range(start, stop-1, -step) )

			# TODO handle unnecessary brackets

			else:
				raise ValueError(f"Unexpected token: {t!r}.")

		return page_nums


	def parse_pagetext(pt_string, doclen, as_indices=True):
		"""Parse a page text string into a list of pages.

		Before parsing, spaces are removed, the text is lowercased, and the
		aliases ``begin`` and ``end`` are replaced with ``1`` and *doclen*.

		Args:
			pt_string: The page text expression.
			doclen: Number of pages in the document.
			as_indices: If true, return 0-based indices, otherwise 1-based page
				numbers.

		Returns:
			List of integers in the order given by the expression.
		"""
		# TODO add check step
		pt_string = pt_string.replace(" ", "").lower().replace(BEGIN, "1").replace(END, str(doclen))
		pages = _parse(pt_string, doclen)
		if as_indices:
			pages = [p-1 for p in pages]
		return pages