Skip to content

Instantly share code, notes, and snippets.

@elvanja
Last active November 27, 2023 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elvanja/202b31ae5735f297956e1d8f567c7d7e to your computer and use it in GitHub Desktop.
Save elvanja/202b31ae5735f297956e1d8f567c7d7e to your computer and use it in GitHub Desktop.
Boolean query lexer (not AST unfortunately)
defmodule BooleanQuery do
use Ecto.Type
import Gettext
def type, do: :string
@operator_expressions %{
"AND" => {:operator, "+"},
"UND" => {:operator, "+"},
"OR" => {:operator, "|"},
"ODER" => {:operator, "|"},
"NOT" => {:operator, "-"},
"NICHT" => {:operator, "-"},
"(" => {:operator, "("},
")" => {:operator, ")"}
}
@doc """
Converts input boolean search string into list of terms and operators, to be used for ES searching as needed
see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax
## Examples
iex> BooleanQuery.cast("Java OR Ruby")
{:ok, [{:term, "java"}, {:operator, "|"}, {:term, "ruby"}]}
"""
def cast(query) when is_binary(query) do
with {:ok, terms} <- lex(query) do
terms
|> parse()
|> detect_invalid_multiple_operators()
end
end
def cast(_), do: :error
def load(_data), do: :error
def dump(_), do: :error
defp lex(query) do
result =
query
|> String.replace("\n", " ")
|> String.split("")
|> Enum.reduce({"", [], false, 0}, &scan/2)
case result do
{term, terms, false, 0} ->
{:ok, Enum.reject([term | terms], &(&1 == ""))}
{_, _, quotes, parenthesis} ->
message =
case {quotes, parenthesis} do
{true, 0} -> gettext("unclosed quotes detected")
{false, _} -> gettext("unclosed parenthesis detected")
_ -> gettext("unclosed parenthesis and/or quotes detected")
end
{:error, [{:message, message}]}
end
end
defp parse(terms) do
terms
|> detect_operators()
|> concat_consecutive_terms()
|> nest()
|> consolidate_prefix_operators()
|> remove_suffix_operators()
end
# ==> QUOTED BEGIN
# does not handle nested quotes
defp scan("\"", {_term, terms, false, parenthesis}) do
{"", terms, true, parenthesis}
end
defp scan("\"", {term, terms, true, parenthesis}) do
{"", terms ++ [~s{"#{term}"}], false, parenthesis}
end
defp scan(char, {term, terms, true, parenthesis}) do
{term <> char, terms, true, parenthesis}
end
# ==> QUOTED END
# ==> PARENTHESIS BEGIN
defp scan("(", {term, terms, false, parenthesis}) do
{"", terms ++ [term, "("], false, parenthesis + 1}
end
defp scan(")", {term, terms, false, parenthesis}) do
{"", terms ++ [term, ")"], false, parenthesis - 1}
end
# ==> PARENTHESIS END
# ==> WHITESPACE BEGIN
defp scan("", {term, terms, false, parenthesis}) do
{"", terms ++ [term], false, parenthesis}
end
defp scan(" ", {term, terms, false, parenthesis}) do
{"", terms ++ [term], false, parenthesis}
end
# ==> WHITESPACE END
# ==> THE REST BEGIN
defp scan(char, {term, terms, false, parenthesis}) do
{term <> char, terms, false, parenthesis}
end
# ==> THE REST END
defp detect_operators(terms) do
terms
|> Enum.map(fn term ->
@operator_expressions[String.upcase(term)] || {:term, term}
end)
|> detect_and_not()
end
defp detect_and_not([{:operator, "+"} | rest]) do
case rest do
[{:operator, "-"} | after_and_not] -> [{:operator, "+-"} | detect_and_not(after_and_not)]
_ -> [{:operator, "+"} | detect_and_not(rest)]
end
end
defp detect_and_not([term_or_operator | rest]) do
[term_or_operator | detect_and_not(rest)]
end
defp detect_and_not([]), do: []
defp concat_consecutive_terms([{:term, current} | rest]) do
case rest do
[{:term, next} | after_next] -> concat_consecutive_terms([{:term, current <> " " <> next} | after_next])
_ -> [{:term, current} | concat_consecutive_terms(rest)]
end
end
defp concat_consecutive_terms([operator | rest]) do
[operator | concat_consecutive_terms(rest)]
end
defp concat_consecutive_terms([]), do: []
defp nest([]), do: []
defp nest([{:operator, "("} | rest]) do
nest([{:nested, []} | rest])
end
defp nest([{:nested, nested} | rest]) do
case rest do
[{:operator, "("} | _] = reopened ->
[{:nested, _} = re_nested | remainder] = nest(reopened)
nest([{:nested, nested ++ [re_nested]} | remainder])
[{:operator, ")"} | after_closing] ->
[{:nested, nested} | nest(after_closing)]
[term_or_operator | remainder] ->
nest([{:nested, nested ++ [term_or_operator]} | remainder])
end
end
defp nest([term_or_operator | rest]) do
[term_or_operator | nest(rest)]
end
# credo:disable-for-lines:38 Credo.Check.Refactor.CyclomaticComplexity
defp consolidate_prefix_operators(list) do
{list, _, _} =
Enum.reduce(list, {[], [], false}, fn term_or_operator, {list, prefix_operators, consolidated} ->
case {term_or_operator, consolidated} do
{{:operator, _}, false} ->
{list, [term_or_operator | prefix_operators], consolidated}
{{:term, _}, false} ->
list =
case operator = List.first(prefix_operators) do
{:operator, "-"} -> [operator | list]
{:operator, "+-"} -> [operator | list]
_ -> list
end
{[term_or_operator | list], [], true}
{{:nested, nested}, false} ->
# credo:disable-for-lines:6 Credo.Check.Refactor.Nesting
list =
case operator = List.first(prefix_operators) do
{:operator, "-"} -> [operator | list]
{:operator, "+-"} -> [operator | list]
_ -> list
end
{[{:nested, consolidate_prefix_operators(nested)} | list], [], true}
{{:nested, nested}, true} ->
{[{:nested, consolidate_prefix_operators(nested)} | list], prefix_operators, consolidated}
{_, true} ->
{[term_or_operator | list], prefix_operators, consolidated}
end
end)
Enum.reverse(list)
end
defp remove_suffix_operators(list) do
{list, _} =
list
|> Enum.reverse()
|> Enum.reduce({[], false}, fn term_or_operator, {list, removed} ->
case {term_or_operator, removed} do
{{:operator, _}, false} ->
{list, removed}
{{:term, _}, false} ->
{[term_or_operator | list], true}
{{:nested, nested}, false} ->
{[{:nested, remove_suffix_operators(nested)} | list], true}
{{:nested, nested}, true} ->
{[{:nested, remove_suffix_operators(nested)} | list], removed}
{_, true} ->
{[term_or_operator | list], removed}
end
end)
list
end
defp detect_invalid_multiple_operators(list) do
case do_detect_invalid_multiple_operators(list) do
{_, nil} ->
{:ok, list}
{_, [previous, current]} ->
operators =
[previous, current]
|> Enum.map_join(", ", &translate/1)
message = gettext("consecutive boolean operators detected: %{operators}", operators: operators)
{:error, [{:message, message}]}
end
end
defp do_detect_invalid_multiple_operators(list) do
Enum.reduce_while(list, {nil, nil}, fn term_or_operator, {last_term_or_operator, _} ->
case {term_or_operator, last_term_or_operator} do
{{:term, _}, _} ->
{:cont, {nil, nil}}
{{:operator, _}, nil} ->
{:cont, {term_or_operator, nil}}
{{:operator, current}, {:operator, previous}} ->
{:halt, {nil, [previous, current]}}
{{:nested, nested}, _} ->
# credo:disable-for-lines:4 Credo.Check.Refactor.Nesting
case do_detect_invalid_multiple_operators(nested) do
{_, nil} -> {:cont, {nil, nil}}
{_, [previous, current]} -> {:halt, {nil, [previous, current]}}
end
end
end)
end
defp translate("+"), do: gettext("AND")
defp translate("|"), do: gettext("OR")
defp translate("-"), do: gettext("NOT")
defp translate("+-"), do: gettext("AND NOT")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment