@rondy
Last active August 29, 2015 14:07
ExUnit.start
defmodule SearchTokenizerTest do
  use ExUnit.Case

  test "tokenizes an empty text" do
    text = ""
    assert Search.tokenize(text) == []
  end

  test "tokenizes a single character" do
    text = "a"
    assert Search.tokenize(text) == [{"a", 0}]
  end

  test "tokenizes a word" do
    text = "jambu"
    assert Search.tokenize(text) == [{"jambu", 0}]
  end

  test "tokenizes two words" do
    text = "jambu treme"
    assert Search.tokenize(text) == [{"jambu", 0}, {"treme", 6}]
  end

  test "tokenizes a sentence with many words" do
    text = "se você quiser saber o que a jamburana faz"
    assert Search.tokenize(text) == [
      {"se", 0},
      {"você", 3},
      {"quiser", 8},
      {"saber", 15},
      {"o", 21},
      {"que", 23},
      {"a", 27},
      {"jamburana", 29},
      {"faz", 39}
    ]
  end
end
defmodule Search do
  @separator " "
  @initial_buffer ""
  @initial_tokens_list []
  @initial_index 0

  def tokenize(text) do
    String.codepoints(text) |>
      find_tokens(@initial_buffer, @initial_tokens_list, @initial_index)
  end

  # Empty characters list.
  defp find_tokens([], @initial_buffer, @initial_tokens_list, @initial_index) do
    []
  end

  # Token found.
  defp find_tokens([@separator | tail], buffer, tokens_list, index) do
    updated_tokens_list = prepare_tokens_list_from(buffer, index, tokens_list)
    reset_buffer = @initial_buffer
    find_tokens(tail, reset_buffer, updated_tokens_list, index + 1)
  end

  # Last token found.
  defp find_tokens([], buffer, tokens_list, index) do
    final_tokens_list = prepare_tokens_list_from(buffer, index, tokens_list)
    final_tokens_list |> Enum.reverse
  end

  # Consume the characters list.
  defp find_tokens([h | tail], buffer, tokens_list, index) do
    find_tokens(tail, h <> buffer, tokens_list, index + 1)
  end

  defp prepare_tokens_list_from(buffer, index, current_tokens_list) do
    token = token_for(buffer, index)
    [token | current_tokens_list]
  end

  defp token_for(buffer, index) do
    token_found = prepare_token_found(buffer)
    token_position = calculate_token_position(token_found, index)
    {token_found, token_position}
  end

  # The buffer is built by prepending codepoints, so the token comes out reversed.
  defp prepare_token_found(buffer) do
    String.reverse(buffer)
  end

  # The index points just past the token's last codepoint; subtracting the
  # token length gives the codepoint offset where the token starts.
  defp calculate_token_position(token, index) do
    index - String.length(token)
  end
end
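
A note on running it: ExUnit.start at the top registers an at_exit hook, so the whole gist can be saved as a single .exs script and executed directly with the elixir binary, and the test cases run when the script finishes. Each result tuple pairs a token with its codepoint offset in the text, not a byte offset, as the accented "você" case shows. A quick manual check in IEx (the file name search_tokenizer_test.exs below is just an assumed example, not part of the gist) would look like:

  $ elixir search_tokenizer_test.exs

  $ iex search_tokenizer_test.exs
  iex> Search.tokenize("jambu treme")
  [{"jambu", 0}, {"treme", 6}]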

rcillo commented Oct 4, 2014

Nice one, Rondi.

Here's a curveball for you :trollface:

test "skips multiple whitespaces" do
  text = " jambu  treme "
  assert Search.tokenize(text) == [{"jambu", 1}, {"treme", 8}]
end
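
One way to make the tokenizer pass this extra test (a sketch, not part of the original gist) is to ignore separators while the buffer is still empty and to skip the trailing-token step when the input ends with nothing buffered. Two additional clauses, placed above the existing "Token found." and "Last token found." clauses respectively, would do it:

  # Separator with nothing buffered yet (leading or repeated whitespace):
  # just advance the index without emitting an empty token.
  defp find_tokens([@separator | tail], @initial_buffer, tokens_list, index) do
    find_tokens(tail, @initial_buffer, tokens_list, index + 1)
  end

  # End of input with nothing buffered (trailing whitespace):
  # there is no last token to emit.
  defp find_tokens([], @initial_buffer, tokens_list, _index) do
    tokens_list |> Enum.reverse
  end

With those clauses in place, " jambu  treme " tokenizes to [{"jambu", 1}, {"treme", 8}], matching the test above, and the existing tests keep passing.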
