niklasl/parsequery.py

## parsequery.py
"""
A simple query language parser experiment.
"""

OPERATORS: dict[str, tuple[str | None, int, str | int]] = {
    'and': ('AND', -1, 'AND'),
    'or': ('OR', -1, 'AND'),
    'not': ('NOT', 0, 1),
    '^': ('INV', 0, 1),
    '=': ('EQ', -1, 1),
    '!=': ('NEQ', -1, 1),
    '=~': ('MATCH', 0, 1),
}
for alias, key in [('&&', 'and'), ('||', 'or')]:
    OPERATORS[alias] = OPERATORS[key]


SPECIALS = {key for key in OPERATORS if len(key) == 1}


def tokenize(s: str, at=0) -> tuple[list, int]:
    tokens: list = []
    word: list[str] = []
    quote_start = False
    escaped = False

    while at < len(s):
        c = s[at]
        at += 1

        if escaped:
            escaped = False
            word.append(c)
            continue

        if c == '\\':
            escaped = True
            continue

        if quote_start and c != '"':
            word.append(c)
            continue

        if c == '"':
            quote_start = not quote_start
            continue

        if c == '(':
            item, at = tokenize(s, at)
            tokens.append(item)
            continue
        elif c == ')':
            _add(tokens, word)
            word = []
            return tokens, at
        if c in SPECIALS:
            _add(tokens, word)
            word = []
            _add(tokens, [c])
        elif c in {' ', '\n', '\r', '\t'}:
            _add(tokens, word)
            word = []
        else:
            word.append(c)

    if word:
        _add(tokens, word)

    return tokens, at


def _add(tokens: list, word: list[str]):
    if word:
        tokens.append(''.join(word) if isinstance(word, list) else word)


def parse(tokens: list, slurp_until: str | int = 0) -> list:
    tree: list[object] = []
    item = None
    while tokens:
        item = tokens.pop(0)

        if isinstance(item, list):
            tree.append(parse(item))
        elif isinstance(item, str):
            operator, prev, next_slurp_until = _get_operator(item)
            if operator is not None:
                collected = parse(tokens, next_slurp_until)
                if prev == -1:
                    collected.insert(0, tree.pop(0))
                if isinstance(next_slurp_until, str):
                    tree.append({operator: collected})
                    return tree
                else:
                    tree.append({operator: collected})
            else:
                tree.append(item)
        else:
            raise ValueError(f"Unknown item: {item}")

        if slurp_until == item:
            break

        if isinstance(slurp_until, int) and len(tree) == slurp_until:
            break

    return tree


def _get_operator(word: str) -> tuple[str | None, int, str | int]:
    return OPERATORS.get(word.lower(), (None, 0, 0))


if __name__ == '__main__':
    q = r"""
        a = "A A A" and (b = "B B" or c = "C = \"C\" or B != \\n")
        and (not d or ^e)
        and (
            not (
                (f and not (g or h))
                or i
            )
        )
    """

    print(q)

    tokentree, at = tokenize(q)
    print(tokentree)

    import json

    tree = parse(tokentree[:])
    print(json.dumps(tree, indent=2))
	"""
	A simple query language parser experiment.
	"""

	OPERATORS: dict[str, tuple[str \| None, int, str \| int]] = {
	'and': ('AND', -1, 'AND'),
	'or': ('OR', -1, 'AND'),
	'not': ('NOT', 0, 1),
	'^': ('INV', 0, 1),
	'=': ('EQ', -1, 1),
	'!=': ('NEQ', -1, 1),
	'=~': ('MATCH', 0, 1),
	}
	for alias, key in [('&&', 'and'), ('\|\|', 'or')]:
	OPERATORS[alias] = OPERATORS[key]


	SPECIALS = {key for key in OPERATORS if len(key) == 1}


	def tokenize(s: str, at=0) -> tuple[list, int]:
	tokens: list = []
	word: list[str] = []
	quote_start = False
	escaped = False

	while at < len(s):
	c = s[at]
	at += 1

	if escaped:
	escaped = False
	word.append(c)
	continue

	if c == '\\':
	escaped = True
	continue

	if quote_start and c != '"':
	word.append(c)
	continue

	if c == '"':
	quote_start = not quote_start
	continue

	if c == '(':
	item, at = tokenize(s, at)
	tokens.append(item)
	continue
	elif c == ')':
	_add(tokens, word)
	word = []
	return tokens, at
	if c in SPECIALS:
	_add(tokens, word)
	word = []
	_add(tokens, [c])
	elif c in {' ', '\n', '\r', '\t'}:
	_add(tokens, word)
	word = []
	else:
	word.append(c)

	if word:
	_add(tokens, word)

	return tokens, at


	def _add(tokens: list, word: list[str]):
	if word:
	tokens.append(''.join(word) if isinstance(word, list) else word)


	def parse(tokens: list, slurp_until: str \| int = 0) -> list:
	tree: list[object] = []
	item = None
	while tokens:
	item = tokens.pop(0)

	if isinstance(item, list):
	tree.append(parse(item))
	elif isinstance(item, str):
	operator, prev, next_slurp_until = _get_operator(item)
	if operator is not None:
	collected = parse(tokens, next_slurp_until)
	if prev == -1:
	collected.insert(0, tree.pop(0))
	if isinstance(next_slurp_until, str):
	tree.append({operator: collected})
	return tree
	else:
	tree.append({operator: collected})
	else:
	tree.append(item)
	else:
	raise ValueError(f"Unknown item: {item}")

	if slurp_until == item:
	break

	if isinstance(slurp_until, int) and len(tree) == slurp_until:
	break

	return tree


	def _get_operator(word: str) -> tuple[str \| None, int, str \| int]:
	return OPERATORS.get(word.lower(), (None, 0, 0))


	if __name__ == '__main__':
	q = r"""
	a = "A A A" and (b = "B B" or c = "C = \"C\" or B != \\n")
	and (not d or ^e)
	and (
	not (
	(f and not (g or h))
	or i
	)
	)
	"""

	print(q)

	tokentree, at = tokenize(q)
	print(tokentree)

	import json

	tree = parse(tokentree[:])
	print(json.dumps(tree, indent=2))