% I put the code for converting the line numbers into runs in another module
% called runs, and I call the function runs:runs() from this module.
% Variable names:
% Is => Indexes, I => Index
% I => {"word", [1, 3, 7, 9]}
% Ws => Words
% W => Word
%% Reads FileName, builds the word index for its lines and prints the
%% prettified index to standard output with ~p.
%% NOTE(review): index:get_file_contents/1 lives in the course-provided
%% index.erl; it presumably returns the file as a list of line strings --
%% confirm against that module.
index_file(FileName) ->
    RawLines = index:get_file_contents(FileName),
    Pretty = prettify(index_lines(norm_lines(RawLines))),
    io:format("~p~n", [Pretty]).
%% Checks that prettify/1 swaps each word's raw line numbers for the value
%% returned by runs:runs/1. The expected list is in the opposite order to the
%% input because prettify/2 conses each entry onto its accumulator.
%% NOTE(review): the expected ranges assume runs:runs/1 sorts and
%% de-duplicates the line numbers before grouping them -- confirm in runs.erl.
prettify_test() ->
    [ {"bye", [{1,4}, {7,8}, {10,10}] },
      {"hi", [{1,3}, {5,5}] }
    ] = prettify([ {"hi", [3,5,1,2]}, {"bye", [3,4,3,1,2, 8,7, 10]} ]),
    ok.
%% Converts every {Word, LineNums} entry of the index into
%% {Word, runs:runs(LineNums)}.
%% The result is in the reverse order of the input; the index has no
%% meaningful order, so nothing is gained by reversing it back.
prettify(Is) ->
    prettify(Is, []).

%% Same as prettify/1, but conses the converted entries onto Acc.
prettify(Is, Acc) ->
    lists:foldl(
        fun({Word, LineNums}, Done) ->
            [{Word, runs:runs(LineNums)} | Done]
        end,
        Acc,
        Is).
%% Exercises index_lines/1 on empty input, a single line, a repeated line and
%% two lines with different words.
index_lines_test() ->
    [] = index_lines([]),
    [{"hi", [1]}, {"bye",[1]} ] = index_lines(["hi bye"]),
    [{"hi", [2, 1]}, {"bye",[2, 1]} ] = index_lines(["hi bye", "hi bye"]),
    %% The order gets a little mixed up: every lookup in find_index/2 reverses
    %% the entries that did not match, and the index is never re-sorted.
    [{"abc", [2]}, {"hi", [1]}, {"bye",[1]}, {"def", [2]} ] =
        index_lines(["hi bye", "abc def"]),
    ok.
%% Builds the index for a list of normalized lines.
%% Ls => Lines. Returns Is => Indexes, a list of I => {"word", [2, 1]} where
%% the second element holds the numbers of the lines containing the word, most
%% recent line first. Line numbers start at 1.
index_lines(Ls) ->
    index_lines(Ls, [], 1).

%% index_lines(Ls, Is, LineNum): Is is the index built so far and LineNum is
%% the number of the line at the head of Ls.
index_lines([], Is, _) ->
    Is;                                   %No lines left: the index is complete.
index_lines([""|Ls], Is, LineNum) ->      %Increment LineNum for blank lines.
    index_lines(Ls, Is, LineNum+1);
index_lines([L|Ls], Is, LineNum) ->
    Words = split(L),
    NewIs = index_words(Words, Is, LineNum),
    index_lines(Ls, NewIs, LineNum+1).
%% Exercises index_words/3.
%% Ws => Word list => ["abc", "def"]
%% Is => Indexes; I => {"word", [2, 1]}
%% Argument order in every call below: Ws, Is, LineNum.
index_words_test() ->
    [] = index_words([], [], 1),
    [{"abc", [1]}] = index_words(["abc"], [], 1),
    [{"def", [2]}, {"abc", [2]} ] = index_words(["abc", "def"], [], 2),
    [{"abc", [2]}, {"def", [1,2]} ] = index_words(["abc"], [{"def", [1,2]}], 2),
    ok.
%% Records that every word in the first argument occurs on line LineNum.
%% W => Word, Ws => Words, Is => Indexes, I => {"word", [4, 3, 1]}.
%% Returns the updated Is. The entry for the word handled last ends up at the
%% head of the list, and LineNum is consed onto the front of its line numbers
%% (once per occurrence, so a word repeated on a line is recorded repeatedly).
index_words([], Is, _) ->
    Is;                                   %No words left: nothing more to record.
index_words([W|Ws], Is, LineNum) ->
    %% If W is not found in Is, then I is the default {W, []}.
    {I, OtherIs} = find_index(W, Is),
    {_, LineNums} = I,
    NewI = {W, [LineNum|LineNums] },
    index_words(Ws, [NewI|OtherIs], LineNum).   %Add the updated I back to Is.
%% Exercises find_index/2: the matching entry is returned first and the
%% remaining entries come back in reverse order.
find_index_test() ->
    { {"pear", []}, [] } = find_index("pear", [ {"pear", []} ] ),
    { {"pear", [1, 2, 3]}, []} = find_index("pear", [ {"pear", [1, 2, 3]} ] ),
    { {"pear", [4, 2]}, [{"beet", [4, 1]}, {"apple", [3, 1]}]} = find_index(
        "pear",
        [{"apple", [3, 1]}, {"pear", [4, 2]}, {"beet", [4, 1]}]),
    ok.
%% Looks up the entry for word W in the index Is.
%% Returns {I, OtherIs}: I is the entry {W, LineNums} taken from Is, or the
%% default {W, []} when W has no entry; OtherIs holds every other entry of Is
%% in reverse order.
find_index(W, Is) ->
    find_index(W, Is, [], {W, []}).

%% find_index(W, Is, OtherIs, FoundI): OtherIs collects the entries that do not
%% match W and FoundI is the result to return if no later entry matches.
find_index(W, Is, OtherIs, FoundI) ->
    lists:foldl(
        fun({Key, _} = Hit, {_, Rest}) when Key =:= W ->
                {Hit, Rest};              %Match: keep it out of the rest.
           (Miss, {Found, Rest}) ->
                {Found, [Miss | Rest]}    %No match: stays in the index.
        end,
        {FoundI, OtherIs},
        Is).
%% Exercises split/1. Words come back in reverse order of appearance.
split_test() ->
    ["ab"] = split("ab"),
    ["cd", "ab"] = split("ab cd"),
    ["aa", "aa"] = split("aa aa "),      %Trailing space: tests bug fix 2.
    ["def", "abc"] = split("abc def"),
    ["def", "abc"] = split(" abc def"),  %Leading space is skipped (bug fix 1).
    ["def", "abc"] = split("abc  def"),  %Two spaces, as created by normalizing "ab -- cd".
    ok.
%% Splits a normalized line into its words, using the space character as the
%% only separator. Words come back in reverse order of appearance, e.g.
%% split("ab cd") returns ["cd", "ab"].
split(Line) ->
    split(Line, [], []).

%% split(Rest, Word, Words): Word holds the characters of the word being read,
%% most recent first; Words holds the completed words, most recent first.
%%
%% Bug fix 2: when the input ends right after a space, Word is empty and must
%% not be added to Words as a blank string.
split([], [], Words) ->
    Words;
%% End of input with a word in progress: whatever Word contains is the last word.
split([], Word, Words) ->
    [lists:reverse(Word) | Words];
%% Bug fix 1: normalizing "consecrate -- we" drops the dashes and leaves two
%% consecutive spaces between the words. Without this clause the word after
%% "consecrate" would be " we", which would be indexed separately from "we".
%% A space met while Word is still empty is therefore skipped.
split([$\s | Xs], [], Words) ->
    split(Xs, [], Words);
%% A space after at least one character ends the current word: add it to
%% Words and start a new Word.
split([$\s | Xs], Word, Words) ->
    split(Xs, [], [lists:reverse(Word) | Words]);
split([X | Xs], Word, Words) ->
    split(Xs, [X | Word], Words).
%% Exercises norm_lines/1: every line is normalized and the line order is kept.
norm_lines_test() ->
    ["ab"] = norm_lines(["Ab!"]),
    %% Dropping the dashes from "us -- that" leaves TWO spaces between the
    %% words; split/3 copes with that when the line is broken into words.
    ["ab", "", "us  that"] = norm_lines(
        ["Ab", "", "us -- that"]),
    ["hello world", "goodbye mars", ""] = norm_lines(
        ["Hello, world!", "Goodbye Mars.", ""]),
    ok.
%% Applies normalize/1 to every line and returns the normalized lines in their
%% original order. Blank lines are kept, so positions in the result still
%% correspond to line numbers in the input.
norm_lines(Lines) ->
    norm_lines(Lines, []).

%% norm_lines(Ls, Acc): Acc holds the lines normalized so far, most recent
%% first, so it is reversed once at the end to restore the input order.
norm_lines([], Acc) ->
    lists:reverse(Acc);
norm_lines([L|Ls], Acc) ->
    norm_lines(Ls, [normalize(L) | Acc] ).
%% Exercises reverse/1 on an empty list and on short lists.
reverse_test() ->
    [] = reverse([]),
    [2, 1] = reverse([1, 2]),
    [1, 2, 3] = reverse([3, 2, 1]),
    ok.
%% Returns the elements of list L in reverse order.
reverse(L) -> reverse(L, []).

%% reverse(Xs, Acc): moves the head of Xs onto Acc until Xs is empty, at which
%% point Acc holds the elements of the original list in reverse order.
reverse([], Acc) ->
    Acc;
reverse([X|Xs], Acc) ->
    reverse(Xs, [X|Acc]).
%% Exercises normalize/1: upper case is lowered, spaces are kept and every
%% other character (digits, punctuation, brackets) is dropped.
normalize_test() ->
    "" = normalize(""),
    "a" = normalize("a"),
    "ab" = normalize("ab"),
    "ab" = normalize("aB"),
    "ab" = normalize("AB"),
    "ab" = normalize("1A+B&2"),
    "ab" = normalize("{%A|B.@}"),
    "ab" = normalize("?!*a';(<}B{>)-+"),
    "a b" = normalize("A b!"),
    ok.
%% Normalizes one line of text: the letters A-Z are converted to lower case,
%% the letters a-z and spaces are kept as they are, and every other character
%% is removed. The characters that survive keep their original order.
normalize(L) ->
    normalize(L, []).

%% normalize(Xs, Acc): Acc holds the characters kept so far, most recent
%% first, so it is reversed once at the end to restore the original order.
normalize([], Acc) ->
    lists:reverse(Acc);
normalize([X|Xs], Acc) when X >= $A, X =< $Z ->            %Caps...
    normalize(Xs, [X+32|Acc]);                             %$a - $A =:= 32, so this lower-cases X.
normalize([X|Xs], Acc) when X >= $a, X =< $z; X =:= $\s -> %Lower case and spaces...
    normalize(Xs, [X|Acc]);                                %no changes.
normalize([_|Xs], Acc) ->                                  %Anything else...
    normalize(Xs, Acc).                                    %skip 'em.
