Skip to content

Instantly share code, notes, and snippets.

@inoas
Created July 5, 2022 15:36
Show Gist options
  • Save inoas/19eedcdd1d0da03cd180d1c4ba29be34 to your computer and use it in GitHub Desktop.
Save inoas/19eedcdd1d0da03cd180d1c4ba29be34 to your computer and use it in GitHub Desktop.
sanitizer.ex
# Script for sanitizing search input. You can run it as:
#
# mix run priv/sanitizer.exs
#
defmodule SearchInputSanitizer do
  @moduledoc """
  Small, composable clean-up steps for free-text search input.

  Every public function takes a string and returns a string, so the steps can
  be chained with `|>`. The steps work grapheme by grapheme and recognise the
  double quote, the round brackets, `&`, `|` and the plain space character.
  Steps that look for a plain space presumably expect
  `remove_surplus_whitespaces/1` to have run first.
  """

  @doc """
  Replaces every code point of the Unicode general category `C` ("Other")
  with a single space.

  NOTE(review): despite the function name this is broader than the ASCII
  control range; the category also contains format, private-use and
  unassigned code points. Confirm that this is intended before narrowing it.
  """
  @spec remove_ascii_control_characters(String.t()) :: String.t()
  def remove_ascii_control_characters(string) do
    Regex.replace(~r/\p{C}/u, string, " ")
  end

  @doc """
  Surrounds every double quote with one space on each side.
  """
  @spec space_pad_double_quotes(String.t()) :: String.t()
  def space_pad_double_quotes(string) do
    String.replace(string, "\"", " \" ")
  end

  @doc """
  Collapses each run of whitespace into one space and trims both ends.

  ## Examples

      iex> SearchInputSanitizer.remove_surplus_whitespaces("  f   o o ")
      "f o o"
  """
  @spec remove_surplus_whitespaces(String.t()) :: String.t()
  def remove_surplus_whitespaces(string) do
    string
    |> String.replace(~r/\s+/u, " ")
    |> String.trim()
  end

  @doc """
  Returns the longest prefix of `string` that is made of whole graphemes and
  is at most `max_bytes` bytes long.

  Scanning stops at the first grapheme that no longer fits, so the cost is
  bounded by the limit rather than by the length of the input.

  ## Examples

      iex> SearchInputSanitizer.take_graphemes_at_max_bytes("abcdef", 3)
      "abc"
  """
  @spec take_graphemes_at_max_bytes(String.t(), integer()) :: String.t()
  def take_graphemes_at_max_bytes(string, max_bytes) do
    binary_part(string, 0, fitting_prefix_size(string, max_bytes, 0))
  end

  @doc """
  Makes double quotes come in pairs and drops quoted phrases that are empty.

  Quotes are paired in reading order (first with second, third with fourth,
  and so on). A phrase that is still open at the end of the input is closed.
  A phrase that contains nothing but whitespace is removed together with its
  quotes. Text between two separate phrases is never treated as a phrase, so
  adjacent phrases are not merged.
  """
  @spec balance_double_quotes(String.t()) :: String.t()
  def balance_double_quotes(string) do
    # `pieces` collects finished output in reverse order. `open_phrase` is
    # `nil` outside quotes and the reversed graphemes of the current phrase
    # inside quotes.
    {pieces, open_phrase} =
      string
      |> String.graphemes()
      |> Enum.reduce({[], nil}, fn
        "\"", {pieces, nil} -> {pieces, []}
        "\"", {pieces, phrase} -> {[quoted_phrase(phrase) | pieces], nil}
        grapheme, {pieces, nil} -> {[grapheme | pieces], nil}
        grapheme, {pieces, phrase} -> {pieces, [grapheme | phrase]}
      end)

    pieces =
      case open_phrase do
        nil -> pieces
        # The input ended inside a phrase: supply the missing closing quote.
        phrase -> [quoted_phrase(phrase) | pieces]
      end

    pieces |> Enum.reverse() |> IO.iodata_to_binary()
  end

  @doc """
  Balances round brackets that are outside of quoted phrases.

  Brackets inside quotes are copied unchanged. Outside quotes:

    * an opening bracket is kept (padded with spaces) while fewer than
      `round_bracket_nesting_limit` brackets are open; otherwise it is dropped
      and so is the closing bracket that belongs to it, which keeps the
      enclosing group intact
    * a closing bracket without a matching opening bracket is dropped
    * brackets still open at the end of the input are closed
  """
  @spec balance_non_quoted_round_brackets(String.t(), integer()) :: String.t()
  def balance_non_quoted_round_brackets(string, round_bracket_nesting_limit) do
    # State: {output, inside quotes?, kept brackets currently open,
    #         dropped opening brackets whose closer is still to come}
    {balanced, _quoted, open_count, _skipped} =
      string
      |> String.graphemes()
      |> Enum.reduce({"", false, 0, 0}, fn
        "\"", {out, quoted, open, skipped} ->
          {out <> "\"", not quoted, open, skipped}

        char, {out, true, open, skipped} ->
          {out <> char, true, open, skipped}

        "(", {out, false, open, skipped} when open + 1 <= round_bracket_nesting_limit ->
          {out <> " ( ", false, open + 1, skipped}

        "(", {out, false, open, skipped} ->
          {out, false, open, skipped + 1}

        ")", {out, false, open, skipped} when skipped > 0 ->
          {out, false, open, skipped - 1}

        ")", {out, false, 0, skipped} ->
          {out, false, 0, skipped}

        ")", {out, false, open, skipped} ->
          {out <> " ) ", false, open - 1, skipped}

        char, {out, false, open, skipped} ->
          {out <> char, false, open, skipped}
      end)

    balanced <> String.duplicate(" ) ", open_count)
  end

  @doc """
  Tidies spaces inside quoted phrases.

  Spaces directly after the opening quote and directly before the closing
  quote are removed, and runs of spaces inside a phrase shrink to one space.
  Text outside quotes is left untouched.
  """
  @spec remove_inner_white_space_padding_within_quoted_sub_strings(String.t()) :: String.t()
  def remove_inner_white_space_padding_within_quoted_sub_strings(string) do
    # The second pass runs on the reversed string, which turns the padding in
    # front of each closing quote into padding after an "opening" quote.
    string
    |> strip_quoted_padding()
    |> String.reverse()
    |> strip_quoted_padding()
    |> String.reverse()
  end

  @doc """
  Removes `&` and `|` operators that lack an operand on either side.

  Operators inside quoted phrases are copied unchanged. Outside quotes an
  operator is kept (padded with spaces) only when an operand precedes it and
  another one follows it. A word, a quoted phrase or a finished bracket group
  counts as an operand; the start of the input, another operator and the
  inner edge of a bracket do not.
  """
  @spec balance_logical_operators(String.t()) :: String.t()
  def balance_logical_operators(string) do
    # Left to right checks the operand in front of each operator. The same
    # pass over the reversed string, where ")" opens a group, checks the
    # operand behind it.
    string
    |> drop_operators_without_preceding_operand("(")
    |> String.reverse()
    |> drop_operators_without_preceding_operand(")")
    |> String.reverse()
  end

  @doc """
  Removes empty round bracket groups, nested up to `n` levels.

  Patterns for `n`, `n - 1`, ... `1` directly nested empty brackets are
  removed one after the other. Whitespace between the brackets is accepted.
  Returns `string` unchanged when `n` is zero or negative.
  """
  @spec remove_empty_round_brackets(String.t(), integer()) :: String.t()
  def remove_empty_round_brackets(string, n) when is_integer(n) and n > 0 do
    # n opening brackets, then n closing brackets, with optional whitespace
    # in between.
    string
    |> String.replace(~r/(?:\(\s*){#{n}}(?:\)\s*){#{n - 1}}\)/, "")
    |> remove_empty_round_brackets(n - 1)
  end

  def remove_empty_round_brackets(string, n) when is_integer(n), do: string

  @doc """
  Removes spaces directly after an opening and directly before a closing
  round bracket. Brackets and spaces inside quoted phrases are left untouched.
  """
  @spec remove_inner_white_space_padding_within_round_brackets(String.t()) :: String.t()
  def remove_inner_white_space_padding_within_round_brackets(string) do
    string
    |> drop_spaces_after("(")
    |> String.reverse()
    |> drop_spaces_after(")")
    |> String.reverse()
  end

  # Byte size of the longest whole-grapheme prefix of the input that fits
  # into `max_bytes`; `taken` is the size accepted so far.
  defp fitting_prefix_size(rest, max_bytes, taken) do
    case String.next_grapheme(rest) do
      {grapheme, tail} when taken + byte_size(grapheme) <= max_bytes ->
        fitting_prefix_size(tail, max_bytes, taken + byte_size(grapheme))

      _does_not_fit_or_end_of_input ->
        taken
    end
  end

  # Renders one quoted phrase from its reversed graphemes; a phrase made of
  # whitespace only renders as nothing at all.
  defp quoted_phrase(reversed_graphemes) do
    phrase = reversed_graphemes |> Enum.reverse() |> IO.iodata_to_binary()

    if Regex.match?(~r/\A\s*\z/u, phrase) do
      ""
    else
      "\"" <> phrase <> "\""
    end
  end

  # One pass over the graphemes: inside quotes, drops spaces that directly
  # follow the opening quote or another space.
  defp strip_quoted_padding(string) do
    {stripped, _quoted, _skip_space} =
      string
      |> String.graphemes()
      |> Enum.reduce({"", false, false}, fn
        "\"", {out, quoted, _skip_space} -> {out <> "\"", not quoted, not quoted}
        " ", {out, true, true} -> {out, true, true}
        " ", {out, true, false} -> {out <> " ", true, true}
        char, {out, quoted, _skip_space} -> {out <> char, quoted, false}
      end)

    stripped
  end

  # One pass over the graphemes: outside quotes, keeps "&" and "|" only when
  # an operand was seen since the last operator or `opening_bracket`.
  defp drop_operators_without_preceding_operand(string, opening_bracket) do
    {kept, _quoted, _operand_seen} =
      string
      |> String.graphemes()
      |> Enum.reduce({"", false, false}, fn
        # A quoted phrase is an operand, so an operator may follow it.
        "\"", {out, quoted, _operand_seen} ->
          {out <> "\"", not quoted, true}

        char, {out, true, _operand_seen} ->
          {out <> char, true, true}

        operator, {out, false, true} when operator in ["&", "|"] ->
          {out <> " " <> operator <> " ", false, false}

        operator, {out, false, false} when operator in ["&", "|"] ->
          {out, false, false}

        " ", {out, false, operand_seen} ->
          {out <> " ", false, operand_seen}

        # Nothing inside the group has been seen yet, whatever came before.
        ^opening_bracket, {out, false, _operand_seen} ->
          {out <> opening_bracket, false, false}

        # Any other grapheme, including the bracket that ends a group.
        char, {out, false, _operand_seen} ->
          {out <> char, false, true}
      end)

    kept
  end

  # One pass over the graphemes: outside quotes, drops the spaces that
  # directly follow `bracket`.
  defp drop_spaces_after(string, bracket) do
    {stripped, _quoted, _after_bracket} =
      string
      |> String.graphemes()
      |> Enum.reduce({"", false, false}, fn
        "\"", {out, quoted, _after_bracket} -> {out <> "\"", not quoted, false}
        char, {out, true, _after_bracket} -> {out <> char, true, false}
        ^bracket, {out, false, _after_bracket} -> {out <> bracket, false, true}
        " ", {out, false, true} -> {out, false, true}
        char, {out, false, _after_bracket} -> {out <> char, false, false}
      end)

    stripped
  end
end
# Sample search inputs for the sanitizing pipeline that follows this list.
# They cover whitespace and Unicode text, unbalanced double quotes and round
# brackets, stray `&` / `|` operators, and two texts that are far longer than
# the byte limit the pipeline applies.
inputs = [
# "",
# " ",
# # (null, NUL, \0, ^@), originally intended to be an ignored character, but now used by many programming languages including C to mark the end of a string.
# List.to_string([0]),
# # (bell, BEL, \a, ^G), which may cause the device to emit a warning such as a bell or beep sound or the screen flashing.
# List.to_string([7]),
# # (backspace, BS, \b, ^H), may overprint the previous character.
# List.to_string([8]),
# # (horizontal tab, HT, \t, ^I), moves the printing position right to the next tab stop.
# List.to_string([9]),
# # (line feed, LF, \n, ^J), moves the print head down one line, or to the left edge and down. Used as the end of line marker in most UNIX systems and variants.
# List.to_string([10]),
# # (vertical tab, VT, \v, ^K), vertical tabulation.
# List.to_string([11]),
# # (form feed, FF, \f, ^L), to cause a printer to eject paper to the top of the next page, or a video terminal to clear the screen.
# List.to_string([12]),
# # (carriage return, CR, \r, ^M), moves the printing position to the start of the line, allowing overprinting. Used as the end of line marker in Classic Mac OS, OS-9, FLEX (and variants). A CR+LF pair is used by CP/M-80 and its derivatives including DOS and Windows, and by Application Layer protocols such as FTP, SMTP, and HTTP.
# List.to_string([13]),
# # (Control-Z, SUB, EOF, ^Z). Acts as an end-of-file for the Windows text-mode file i/o.
# List.to_string([26]),
# # (escape, ESC, \e (GCC only), ^[). Introduces an escape sequence.
# List.to_string([27]),
" f o o ",
"\"foo\"bar-\"without-quotes",
"💓❦♡♥❤💔éäüöÄÜÖß",
"Unicode\u00A0spaces",
" ) outside () ( \" inside \" outside \"in side\" out side \"inside\"outside\"",
"hello\" \" \"world",
"))))(())))((())\"(()(\")()()(()))))))))))((((((((((((((((((((((((((((((",
" outside \" inside inside \" outside ",
"foo\"bar\"quux",
"foo \" \" bar",
"a ( ( ( ( b",
"& asdasd & asdasd | & asdasd",
"a & | b",
"a & ( | b ) a && | ",
# NOTE(review): `\)` below is not one of the usual string escapes; presumably
# it yields a plain `)` - confirm this is what was meant.
" & | && || (||)(&&) a (\"asdasd\)asdas & z\" ||) & a",
" (a ||) & a",
"csv & (parse | parsing) & (\"library for Elixir\")",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque vel fringilla sapien. Praesent tincidunt convallis sollicitudin. Sed quis feugiat quam, eget congue sem. Mauris at lacinia metus, ac luctus mauris. In eget nisi at tortor accumsan finibus nec ac neque. Sed tincidunt leo vitae libero scelerisque sagittis. Phasellus non tincidunt mi. Mauris sollicitudin placerat tincidunt. Vivamus gravida sem euismod ex ullamcorper facilisis. Maecenas nec mollis neque. Vivamus auctor pharetra volutpat. Cras placerat euismod ligula, ac malesuada lorem pharetra et. Proin arcu velit, volutpat non ultricies eget, lacinia eget arcu. In et congue ante. Ut commodo pharetra felis, sit amet dapibus mauris efficitur eu. Curabitur mollis congue est a semper. Donec pulvinar ullamcorper massa, et blandit lectus pellentesque eu. Fusce commodo ornare lectus, in lacinia nulla auctor efficitur. Maecenas aliquet vitae sem a hendrerit. Cras eu massa arcu. Vestibulum rhoncus ullamcorper quam. Vivamus dictum vel justo at tempor. Curabitur sed magna eget orci porta tincidunt et ut erat. Nam volutpat non enim non efficitur. Quisque nec elit a elit egestas mollis eget eu ante. Donec tempus urna ut dapibus finibus. Curabitur at lorem vitae ligula viverra dapibus at id enim. Maecenas ut nisl vel nunc lacinia consectetur at ac mauris. Nunc ut lobortis ligula, sed pretium justo. Nunc massa risus, tincidunt et tincidunt in, vehicula id tortor. Phasellus id est eget eros eleifend facilisis. Aliquam erat volutpat. Vivamus tristique libero id malesuada bibendum. Mauris congue rutrum nisi vel dictum. Nunc urna turpis, lacinia quis felis ac, luctus laoreet nunc. Aenean efficitur nibh a felis euismod, quis tempor erat finibus. Maecenas odio ipsum, commodo nec dignissim vel, malesuada at nisl. Donec feugiat mollis ex, et sodales magna vehicula sed. Etiam ut dolor felis. Proin et nisl auctor, volutpat nisl id, sodales nisl. Donec lorem magna, semper quis lacus venenatis, tristique tincidunt eros. 
Donec non semper orci. Aliquam ut eleifend ligula. Maecenas sit amet justo eu ligula porttitor suscipit fringilla accumsan nisl. Quisque egestas aliquam porttitor. Duis tempus efficitur bibendum. Quisque mollis dolor tempor, pellentesque nisl vitae, efficitur est. Vestibulum commodo ex eu orci scelerisque, et viverra sapien tristique. Nam imperdiet sapien sit amet odio interdum pretium. Vivamus condimentum lectus augue, ut consequat massa sodales sit amet. Donec ornare metus ante, non hendrerit lorem volutpat et. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Duis sit amet orci eget enim malesuada interdum in ac ipsum. Ut sit amet tristique eros. Morbi mi metus, viverra et eros vel, ullamcorper scelerisque tortor. Mauris tempus a leo sed sagittis. Nulla sit amet nisi in felis tincidunt tristique. Duis nec blandit nunc. Fusce placerat bibendum nisl, vel commodo neque auctor pharetra. Vivamus at neque ante. Etiam pulvinar nisl diam. Mauris tellus elit, aliquet ut nisl non, aliquet faucibus dolor. Proin suscipit dictum erat congue convallis. Sed luctus, est vitae porttitor accumsan, elit nulla scelerisque felis, sit amet dignissim nisl sem in tortor. Etiam at leo commodo, scelerisque enim at, bibendum lectus.",
"Lorem ipsum dolor sit amet, foo: consectetur adipiscing elit. Pellentesque vel fringilla sapien. bar: Praesent tincidunt convallis sollicitudin. Sed quis feugiat quam, eget congue sem. Mauris at lacinia metus, ac luctus mauris. In eget nisi at tortor accumsan finibus nec ac neque. Sed tincidunt leo vitae libero scelerisque sagittis. Phasellus non tincidunt mi. Mauris sollicitudin placerat tincidunt. Vivamus gravida sem euismod ex ullamcorper facilisis. Maecenas nec mollis neque. Vivamus auctor pharetra volutpat. Cras placerat euismod ligula, ac malesuada lorem pharetra et. Proin arcu velit, volutpat non ultricies eget, lacinia eget arcu. In et congue ante. Ut commodo pharetra felis, sit amet dapibus mauris efficitur eu. Curabitur mollis congue est a semper. Donec pulvinar ullamcorper massa, et blandit lectus pellentesque eu. Fusce commodo ornare lectus, in lacinia nulla auctor efficitur. Maecenas aliquet vitae sem a hendrerit. Cras eu massa arcu. Vestibulum rhoncus ullamcorper quam. Vivamus dictum vel justo at tempor. Curabitur sed magna eget orci porta tincidunt et ut erat. Nam volutpat non enim non efficitur. Quisque nec elit a elit egestas mollis eget eu ante. Donec tempus urna ut dapibus finibus. Curabitur at lorem vitae ligula viverra dapibus at id enim. Maecenas ut nisl vel nunc lacinia consectetur at ac mauris. Nunc ut lobortis ligula, sed pretium justo. Nunc massa risus, tincidunt et tincidunt in, vehicula id tortor. Phasellus id est eget eros eleifend facilisis. Aliquam erat volutpat. Vivamus tristique libero id malesuada bibendum. Mauris congue rutrum nisi vel dictum. Nunc urna turpis, lacinia quis felis ac, luctus laoreet nunc. Aenean efficitur nibh a felis euismod, quis tempor erat finibus. Maecenas odio ipsum, commodo nec dignissim vel, malesuada at nisl. Donec feugiat mollis ex, et sodales magna vehicula sed. Etiam ut dolor felis. Proin et nisl auctor, volutpat nisl id, sodales nisl. 
Donec lorem magna, semper quis lacus venenatis, tristique tincidunt eros. Donec non semper orci. Aliquam ut eleifend ligula. Maecenas sit amet justo eu ligula porttitor suscipit fringilla accumsan nisl. Quisque egestas aliquam porttitor. Duis tempus efficitur bibendum. Quisque mollis dolor tempor, pellentesque nisl vitae, efficitur est. Vestibulum commodo ex eu orci scelerisque, et viverra sapien tristique. Nam imperdiet sapien sit amet odio interdum pretium. Vivamus condimentum lectus augue, ut consequat massa sodales sit amet. Donec ornare metus ante, non hendrerit lorem volutpat et. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Duis sit amet orci eget enim malesuada interdum in ac ipsum. Ut sit amet tristique eros. Morbi mi metus, viverra et eros vel, ullamcorper scelerisque tortor. Mauris tempus a leo sed sagittis. Nulla sit amet nisi in felis tincidunt tristique. Duis nec blandit nunc. Fusce placerat bibendum nisl, vel commodo neque auctor pharetra. Vivamus at neque ante. Etiam pulvinar nisl diam. Mauris tellus elit, aliquet ut nisl non, aliquet faucibus dolor. Proin suscipit dictum erat congue convallis. Sed luctus, est vitae porttitor accumsan, elit nulla scelerisque felis, sit amet dignissim nisl sem in tortor. Etiam at leo commodo, scelerisque enim at, bibendum lectus.",
]
# Sends every sample through the whole sanitizing pipeline and prints one
# `input NN => <inspected result>` line per sample.
inputs
|> Enum.with_index()
|> Enum.each(fn {raw_input, position} ->
  # Zero-padded, two-digit position of the sample within the list.
  label = position |> Integer.to_string() |> String.pad_leading(2, "0")
  IO.write("input " <> label <> " => ")

  # SearchInputSanitizer.space_pad_double_quotes/1 is deliberately not part
  # of the chain.
  normalized =
    raw_input
    |> SearchInputSanitizer.remove_ascii_control_characters()
    |> SearchInputSanitizer.remove_surplus_whitespaces()
    |> SearchInputSanitizer.take_graphemes_at_max_bytes(200)

  balanced =
    normalized
    |> SearchInputSanitizer.balance_double_quotes()
    |> SearchInputSanitizer.balance_non_quoted_round_brackets(1)
    |> SearchInputSanitizer.balance_logical_operators()
    |> SearchInputSanitizer.remove_empty_round_brackets(1)

  # Cosmetic steps only; they can be left out when the query is built by a
  # parser.
  tidied =
    balanced
    |> SearchInputSanitizer.remove_inner_white_space_padding_within_quoted_sub_strings()
    |> SearchInputSanitizer.remove_inner_white_space_padding_within_round_brackets()
    |> SearchInputSanitizer.remove_surplus_whitespaces()

  IO.inspect(tidied)
end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment