Skip to content

Instantly share code, notes, and snippets.

@macrat
Created April 29, 2024 11:31
Show Gist options
  • Save macrat/cda9d6fec2301fa19c7c138e29ba935e to your computer and use it in GitHub Desktop.
Save macrat/cda9d6fec2301fa19c7c138e29ba935e to your computer and use it in GitHub Desktop.
A simple search query parser

A simple search query parser

Supported syntax:

  • and (a b, a AND b)
  • or (a OR b)
  • not (NOT a)
  • grouping ((a))
package main
import (
"fmt"
"strings"
)
type TokenType int
const (
LPAREN TokenType = iota
RPAREN
AND
OR
NOT
KEYWORD
)
func (tt TokenType) String() string {
switch tt {
case LPAREN:
return "LPAREN"
case RPAREN:
return "RPAREN"
case AND:
return "AND"
case OR:
return "OR"
case NOT:
return "NOT"
case KEYWORD:
return "KEYWORD"
default:
return "UNKNOWN"
}
}
type Token struct {
Type TokenType
Literal string
}
func Lex(s string) (Token, string) {
if len(s) == 0 {
return Token{}, ""
}
check := func(keyword string) bool {
if len(s) < len(keyword)+1 {
return false
}
if s[:len(keyword)] != keyword {
return false
}
return s[len(keyword)] == ' ' || s[len(keyword)] == '(' || s[len(keyword)] == ')'
}
switch s[0] {
case '(':
return Token{LPAREN, "("}, s[1:]
case ')':
return Token{RPAREN, ")"}, s[1:]
case ' ':
return Lex(s[1:])
case 'A':
if check("AND") {
return Token{AND, "AND"}, s[3:]
}
case 'O':
if check("OR") {
return Token{OR, "OR"}, s[2:]
}
case 'N':
if check("NOT") {
return Token{NOT, "NOT"}, s[3:]
}
default:
i := 0
for i < len(s) && s[i] != ' ' && s[i] != '(' && s[i] != ')' {
i++
}
return Token{KEYWORD, s[:i]}, s[i:]
}
return Token{}, ""
}
func Tokenize(s string) []Token {
tokens := []Token{}
for len(s) > 0 {
var token Token
token, s = Lex(s)
if token.Type == AND && len(tokens) > 0 && tokens[len(tokens)-1].Type == AND {
continue
}
tokens = append(tokens, token)
}
return tokens
}
type Node interface {
fmt.Stringer
Minimize() Node
}
type ContainerNode interface {
Node
Push(Node)
Swap(Node)
Paren() bool
}
type And struct {
children []Node
paren bool
}
func (a *And) String() string {
if len(a.children) == 0 {
if a.paren {
return "[AND]"
} else {
return "(AND)"
}
}
xs := make([]string, len(a.children))
for i, child := range a.children {
xs[i] = child.String()
}
if a.paren {
return fmt.Sprintf("[AND %s]", strings.Join(xs, " "))
} else {
return fmt.Sprintf("(AND %s)", strings.Join(xs, " "))
}
}
func (a *And) Minimize() Node {
children := make([]Node, 0, len(a.children))
for _, child := range a.children {
if c := child.Minimize(); c != nil {
if and, ok := c.(*And); ok {
children = append(children, and.children...)
} else {
children = append(children, c)
}
}
}
if len(children) == 0 {
return nil
} else if len(children) == 1 {
return children[0]
}
return &And{children: children, paren: a.paren}
}
func (a *And) Push(node Node) {
a.children = append(a.children, node)
}
func (a *And) Pop() Node {
if len(a.children) == 0 {
return nil
}
node := a.children[len(a.children)-1]
a.children = a.children[:len(a.children)-1]
return node
}
func (a *And) Swap(node Node) {
if len(a.children) == 0 {
a.Push(node)
} else {
a.children[len(a.children)-1] = node
}
}
func (a *And) Paren() bool {
return a.paren
}
type Or struct {
children []Node
paren bool
}
func (o *Or) String() string {
if len(o.children) == 0 {
if o.paren {
return "[OR]"
} else {
return "(OR)"
}
}
xs := make([]string, len(o.children))
for i, child := range o.children {
xs[i] = child.String()
}
if o.paren {
return fmt.Sprintf("[OR %s]", strings.Join(xs, " "))
} else {
return fmt.Sprintf("(OR %s)", strings.Join(xs, " "))
}
}
func (o *Or) Minimize() Node {
children := make([]Node, 0, len(o.children))
for _, child := range o.children {
if c := child.Minimize(); c != nil {
if or, ok := c.(*Or); ok {
children = append(children, or.children...)
} else {
children = append(children, c)
}
}
}
if len(children) == 0 {
return nil
} else if len(children) == 1 {
return children[0]
}
return &Or{children: children, paren: o.paren}
}
func (o *Or) Push(node Node) {
o.children = append(o.children, node)
}
func (o *Or) Pop() Node {
if len(o.children) == 0 {
return nil
}
node := o.children[len(o.children)-1]
o.children = o.children[:len(o.children)-1]
return node
}
func (o *Or) Swap(node Node) {
if len(o.children) == 0 {
o.Push(node)
} else {
o.children[len(o.children)-1] = node
}
}
func (o *Or) Paren() bool {
return o.paren
}
type Not struct {
Child Node
}
func (n *Not) String() string {
if n.Child == nil {
return "(NOT)"
} else {
return fmt.Sprintf("(NOT %s)", n.Child.String())
}
}
func (n *Not) Minimize() Node {
if n.Child == nil {
return nil
}
c := n.Child.Minimize()
if c == nil {
return nil
}
if _, ok := c.(*Not); ok {
return c.(*Not).Child
}
return &Not{Child: c}
}
func (n *Not) Push(node Node) {
n.Child = node
}
func (n *Not) Pop() Node {
node := n.Child
n.Child = nil
return node
}
func (n *Not) Swap(node Node) {
n.Child = node
}
func (n *Not) Paren() bool {
return false
}
type Keyword struct {
Value string
}
func (k *Keyword) String() string {
return fmt.Sprintf("%q", k.Value)
}
func (k *Keyword) Minimize() Node {
return k
}
type Parser struct {
nodes []ContainerNode
}
func NewParser() *Parser {
return &Parser{nodes: []ContainerNode{&And{
paren: true,
}}}
}
func (p *Parser) Top() ContainerNode {
return p.nodes[len(p.nodes)-1]
}
func (p *Parser) Pop() {
if len(p.nodes) > 1 {
p.nodes = p.nodes[:len(p.nodes)-1]
} else {
p.nodes[0] = &And{
children: []Node{p.nodes[0]},
paren: true,
}
}
switch top := p.Top().(type) {
case *And:
if len(top.children) == 1 {
top.Swap(top.Pop())
}
case *Not:
p.Pop()
}
}
func (p *Parser) Push(node ContainerNode) {
p.Top().Push(node)
p.nodes = append(p.nodes, node)
}
func (p *Parser) Swap(node ContainerNode) {
p.nodes[len(p.nodes)-1] = node
if len(p.nodes) > 1 {
p.nodes[len(p.nodes)-2].Swap(node)
}
}
func (p *Parser) AndPush(node Node) {
switch top := p.Top().(type) {
case *And:
top.Push(node)
case *Or:
and := &And{children: []Node{top.Pop(), node}}
p.Push(and)
case *Not:
top.Child = node
default:
panic("unreachable")
}
}
func (p *Parser) OrPush(node Node) {
switch top := p.Top().(type) {
case *And:
or := &Or{children: []Node{p.Top(), node}, paren: top.paren}
if len(top.children) == 1 {
or.children[0] = top.Pop()
} else {
top.paren = false
}
p.Swap(or)
case *Or:
top.Push(node)
case *Not:
top.Child = node
default:
panic("unreachable")
}
}
type PushMode bool
const (
PUSH_AND PushMode = false
PUSH_OR PushMode = true
)
func (p *Parser) PushNode(node Node, mode PushMode) {
if mode == PUSH_OR {
p.OrPush(node)
} else {
p.AndPush(node)
}
}
func (p *Parser) Parse(tokens []Token) Node {
mode := PUSH_AND
for _, token := range tokens {
switch token.Type {
case LPAREN:
and := &And{
paren: true,
}
p.PushNode(and, mode)
p.nodes = append(p.nodes, and)
mode = PUSH_AND
case RPAREN:
for !p.Top().Paren() {
p.Pop()
}
p.Pop()
mode = PUSH_AND
case AND:
mode = PUSH_AND
case OR:
mode = PUSH_OR
case NOT:
if _, ok := p.Top().(*Not); ok {
p.Pop()
} else {
not := &Not{}
p.PushNode(not, mode)
p.nodes = append(p.nodes, not)
}
case KEYWORD:
p.PushNode(&Keyword{Value: token.Literal}, mode)
mode = PUSH_AND
default:
panic("unreachable")
}
fmt.Printf("\n### %s(%q) stack: %d\n %s\n", token.Type, token.Literal, len(p.nodes), p.nodes[0])
}
return p.nodes[0]
}
func Parse(input string) Node {
tokens := Tokenize(input)
parser := NewParser()
return parser.Parse(tokens)
}
func main() {
input := "a OR NOT (b OR c d) OR e"
fmt.Println("Input:")
fmt.Println("", input)
node := Parse(input)
fmt.Println("\nRaw AST:")
fmt.Println("", node)
fmt.Println("\nMinified AST:")
fmt.Println("", node.Minimize())
}
package main
import (
"testing"
)
func TestParse(t *testing.T) {
tests := []struct {
input string
output string
}{
{"a", `"a"`},
{"a b", `[AND "a" "b"]`},
{"a b c", `[AND "a" "b" "c"]`},
{"a OR b", `[OR "a" "b"]`},
{"a AND b OR c", `[OR (AND "a" "b") "c"]`},
{"a OR b AND c", `[OR "a" (AND "b" "c")]`},
{"a AND b OR c AND d", `[OR (AND "a" "b") (AND "c" "d")]`},
{"a OR b AND c OR d", `[OR "a" (AND "b" "c") "d"]`},
{"NOT a", `(NOT "a")`},
{"a OR NOT b", `[OR "a" (NOT "b")]`},
{"(a)", `"a"`},
{"(a b)", `[AND "a" "b"]`},
{"(a OR b)", `[OR "a" "b"]`},
{"a (b OR c)", `[AND "a" [OR "b" "c"]]`},
{"(a OR b) c", `[AND [OR "a" "b"] "c"]`},
{"(a OR b) (c OR d)", `[AND [OR "a" "b"] [OR "c" "d"]]`},
{"((a OR b) AND c) OR d", `[OR [AND [OR "a" "b"] "c"] "d"]`},
{"a OR NOT (b OR c) OR d", `[OR "a" (NOT [OR "b" "c"]) "d"]`},
{"a OR NOT (b OR c AND d) OR e", `[OR "a" (NOT [OR "b" (AND "c" "d")]) "e"]`},
{"a OR (b OR c) OR d", `[OR "a" "b" "c" "d"]`},
{"a OR (b AND c) OR d", `[OR "a" [AND "b" "c"] "d"]`},
{"a OR b) c", `[AND [OR "a" "b"] "c"]`},
{"a (b OR c", `[AND "a" [OR "b" "c"]]`},
{"NOT NOT a", `"a"`},
}
for _, test := range tests {
node := Parse(test.input).Minimize()
if node.String() != test.output {
t.Errorf("%q\nwant: %s\n got: %s", test.input, test.output, node.String())
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment