Created
January 26, 2023 21:12
-
-
Save niklasl/cc5df8d5500a3e813e2bcf1a48260213 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A simple query language parser experiment. | |
""" | |
OPERATORS: dict[str, tuple[str | None, int, str | int]] = { | |
'and': ('AND', -1, 'AND'), | |
'or': ('OR', -1, 'AND'), | |
'not': ('NOT', 0, 1), | |
'^': ('INV', 0, 1), | |
'=': ('EQ', -1, 1), | |
'!=': ('NEQ', -1, 1), | |
'=~': ('MATCH', 0, 1), | |
} | |
for alias, key in [('&&', 'and'), ('||', 'or')]: | |
OPERATORS[alias] = OPERATORS[key] | |
SPECIALS = {key for key in OPERATORS if len(key) == 1} | |
def tokenize(s: str, at=0) -> tuple[list, int]: | |
tokens: list = [] | |
word: list[str] = [] | |
quote_start = False | |
escaped = False | |
while at < len(s): | |
c = s[at] | |
at += 1 | |
if escaped: | |
escaped = False | |
word.append(c) | |
continue | |
if c == '\\': | |
escaped = True | |
continue | |
if quote_start and c != '"': | |
word.append(c) | |
continue | |
if c == '"': | |
quote_start = not quote_start | |
continue | |
if c == '(': | |
item, at = tokenize(s, at) | |
tokens.append(item) | |
continue | |
elif c == ')': | |
_add(tokens, word) | |
word = [] | |
return tokens, at | |
if c in SPECIALS: | |
_add(tokens, word) | |
word = [] | |
_add(tokens, [c]) | |
elif c in {' ', '\n', '\r', '\t'}: | |
_add(tokens, word) | |
word = [] | |
else: | |
word.append(c) | |
if word: | |
_add(tokens, word) | |
return tokens, at | |
def _add(tokens: list, word: list[str]): | |
if word: | |
tokens.append(''.join(word) if isinstance(word, list) else word) | |
def parse(tokens: list, slurp_until: str | int = 0) -> list: | |
tree: list[object] = [] | |
item = None | |
while tokens: | |
item = tokens.pop(0) | |
if isinstance(item, list): | |
tree.append(parse(item)) | |
elif isinstance(item, str): | |
operator, prev, next_slurp_until = _get_operator(item) | |
if operator is not None: | |
collected = parse(tokens, next_slurp_until) | |
if prev == -1: | |
collected.insert(0, tree.pop(0)) | |
if isinstance(next_slurp_until, str): | |
tree.append({operator: collected}) | |
return tree | |
else: | |
tree.append({operator: collected}) | |
else: | |
tree.append(item) | |
else: | |
raise ValueError(f"Unknown item: {item}") | |
if slurp_until == item: | |
break | |
if isinstance(slurp_until, int) and len(tree) == slurp_until: | |
break | |
return tree | |
def _get_operator(word: str) -> tuple[str | None, int, str | int]: | |
return OPERATORS.get(word.lower(), (None, 0, 0)) | |
if __name__ == '__main__': | |
q = r""" | |
a = "A A A" and (b = "B B" or c = "C = \"C\" or B != \\n") | |
and (not d or ^e) | |
and ( | |
not ( | |
(f and not (g or h)) | |
or i | |
) | |
) | |
""" | |
print(q) | |
tokentree, at = tokenize(q) | |
print(tokentree) | |
import json | |
tree = parse(tokentree[:]) | |
print(json.dumps(tree, indent=2)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment