Last active
May 15, 2024 13:45
-
-
Save medihack/581957e105f2bea591f531c77587fdd2 to your computer and use it in GitHub Desktop.
A pyparsing parser that parses Googe like search strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyparsing as pp | |
from typing import Literal | |
class TermNode: | |
def __init__(self, term_type: Literal["WORD", "PHRASE"], value: "Node"): | |
self.term_type = term_type | |
self.value = value | |
def __repr__(self): | |
return f"TermNode({self.term_type}, {self.value})" | |
class UnaryNode: | |
def __init__(self, operator: Literal["NOT"], operand: "Node"): | |
self.operator = operator | |
self.operand = operand | |
def __repr__(self): | |
return f"UnaryNode({self.operator}, {self.operand})" | |
class BinaryNode: | |
def __init__(self, operator: Literal["AND", "OR"], left: "Node", right: "Node"): | |
self.operator = operator | |
self.left = left | |
self.right = right | |
def __repr__(self): | |
return f"BinaryNode({self.operator}, {self.left}, {self.right})" | |
Node = TermNode | UnaryNode | BinaryNode | |
not_ = pp.Keyword("NOT") | |
and_ = pp.Keyword("AND") | |
or_ = pp.Keyword("OR") | |
lparen = pp.Literal("(") | |
rparen = pp.Literal(")") | |
extra_chars = "_-'" | |
word = ~(not_ | and_ | or_) + pp.Word(pp.alphanums + pp.alphas8bit + extra_chars).set_parse_action(lambda t: TermNode("WORD", t[0])) | |
phrase = pp.QuotedString(quoteChar='"').set_parse_action(lambda t: TermNode("PHRASE", t[0])) | |
term = (phrase | word) | |
or_expression = pp.Forward() | |
parens_expression = pp.Forward() | |
parens_expression <<= (pp.Suppress(lparen) + or_expression + pp.Suppress(rparen)) | term | |
not_expression = pp.Forward() | |
not_expression <<= (not_ + not_expression).set_parse_action(lambda t: UnaryNode("NOT", t[1])) | parens_expression | |
and_expression = pp.Forward() | |
and_expression <<= (not_expression + and_ + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[2])) | (not_expression + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[1])) | not_expression | |
or_expression <<= (and_expression + or_ + or_expression).set_parse_action(lambda t: BinaryNode("OR", t[0], t[2])) | and_expression | |
#or_expression.parse_string('', parse_all=True) | |
or_expression.run_tests("""\ | |
### | |
# Valid expressions | |
### | |
# Word term | |
foobar | |
# Umlaute in word term | |
Gürtel | |
# Phrase term | |
"foo bar" | |
# Special characters in phrase | |
"foo!~ bar %" | |
# Implicit AND | |
foo bar | |
# Explicit AND | |
foo AND bar | |
# Explicit OR | |
foo OR bar | |
# NOT | |
NOT foo | |
# Parenthesis | |
foo AND (bar OR baz) | |
# Complex expression 1 | |
NOT foo AND ("bar baz" OR qux) | |
# Complex expression 2 | |
foo AND (NOT "bar baz" (moo OR zoo) AND yoo) | |
# Complex expression 3 | |
foo (bar NOT "baz moo") zoo | |
### | |
# Invalid expressions | |
### | |
# Unary before binary operator | |
foo NOT AND bar | |
# Unknown char outside quotes | |
foo ~ bar | |
# Binary operator at start of line | |
AND foo | |
# Binary operator at start of parens expression | |
(AND bar) | |
# Binary operator at end of line | |
foo AND | |
# Binary operator at end of parens expression | |
(foo AND) | |
# Unary operator at end of line | |
foo NOT | |
# Unary operator at end of parens expression | |
(foo NOT) | |
# Unbalanced parens | |
((foo) | |
# Unbalanced quotes | |
""foo" | |
"""); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment