Skip to content

Instantly share code, notes, and snippets.

@medihack
Last active May 15, 2024 13:45
Show Gist options
  • Save medihack/581957e105f2bea591f531c77587fdd2 to your computer and use it in GitHub Desktop.
Save medihack/581957e105f2bea591f531c77587fdd2 to your computer and use it in GitHub Desktop.
A pyparsing parser that parses Googe like search strings
import pyparsing as pp
from typing import Literal
class TermNode:
def __init__(self, term_type: Literal["WORD", "PHRASE"], value: "Node"):
self.term_type = term_type
self.value = value
def __repr__(self):
return f"TermNode({self.term_type}, {self.value})"
class UnaryNode:
def __init__(self, operator: Literal["NOT"], operand: "Node"):
self.operator = operator
self.operand = operand
def __repr__(self):
return f"UnaryNode({self.operator}, {self.operand})"
class BinaryNode:
def __init__(self, operator: Literal["AND", "OR"], left: "Node", right: "Node"):
self.operator = operator
self.left = left
self.right = right
def __repr__(self):
return f"BinaryNode({self.operator}, {self.left}, {self.right})"
Node = TermNode | UnaryNode | BinaryNode
not_ = pp.Keyword("NOT")
and_ = pp.Keyword("AND")
or_ = pp.Keyword("OR")
lparen = pp.Literal("(")
rparen = pp.Literal(")")
extra_chars = "_-'"
word = ~(not_ | and_ | or_) + pp.Word(pp.alphanums + pp.alphas8bit + extra_chars).set_parse_action(lambda t: TermNode("WORD", t[0]))
phrase = pp.QuotedString(quoteChar='"').set_parse_action(lambda t: TermNode("PHRASE", t[0]))
term = (phrase | word)
or_expression = pp.Forward()
parens_expression = pp.Forward()
parens_expression <<= (pp.Suppress(lparen) + or_expression + pp.Suppress(rparen)) | term
not_expression = pp.Forward()
not_expression <<= (not_ + not_expression).set_parse_action(lambda t: UnaryNode("NOT", t[1])) | parens_expression
and_expression = pp.Forward()
and_expression <<= (not_expression + and_ + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[2])) | (not_expression + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[1])) | not_expression
or_expression <<= (and_expression + or_ + or_expression).set_parse_action(lambda t: BinaryNode("OR", t[0], t[2])) | and_expression
#or_expression.parse_string('', parse_all=True)
or_expression.run_tests("""\
###
# Valid expressions
###
# Word term
foobar
# Umlaute in word term
Gürtel
# Phrase term
"foo bar"
# Special characters in phrase
"foo!~ bar %"
# Implicit AND
foo bar
# Explicit AND
foo AND bar
# Explicit OR
foo OR bar
# NOT
NOT foo
# Parenthesis
foo AND (bar OR baz)
# Complex expression 1
NOT foo AND ("bar baz" OR qux)
# Complex expression 2
foo AND (NOT "bar baz" (moo OR zoo) AND yoo)
# Complex expression 3
foo (bar NOT "baz moo") zoo
###
# Invalid expressions
###
# Unary before binary operator
foo NOT AND bar
# Unknown char outside quotes
foo ~ bar
# Binary operator at start of line
AND foo
# Binary operator at start of parens expression
(AND bar)
# Binary operator at end of line
foo AND
# Binary operator at end of parens expression
(foo AND)
# Unary operator at end of line
foo NOT
# Unary operator at end of parens expression
(foo NOT)
# Unbalanced parens
((foo)
# Unbalanced quotes
""foo"
""");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment