7-fl/aw2.erl

## aw2.erl
-module(aw2).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").

% I put the code for converting the line numbers into runs in another module
% called runs, and I call the function runs:runs() from this module.

% Variable names:
%    Is => Indexes, I => Index
%    I => {"word", [1, 3, 7, 9]}
%    Ws => Words
%    W => Word

% Builds the word index for the file named FileName and prints it.
% The lines come from index:get_file_contents/1; they are normalized,
% indexed by line number, prettified, and written with io:format/2.
% Returns whatever io:format/2 returns.
index_file(FileName) ->
    RawLines = index:get_file_contents(FileName),
    WordIndex = index_lines(norm_lines(RawLines)),
    Pretty = prettify(WordIndex),
    io:format("~p~n", [Pretty]).

%------------

% Checks that prettify/1 turns each list of line numbers into runs and
% that the entries come back in reverse order of the input.
prettify_test() ->
    Input = [{"hi", [3, 5, 1, 2]}, {"bye", [3, 4, 3, 1, 2, 8, 7, 10]}],
    Expected = [{"bye", [{1, 4}, {7, 8}, {10, 10}]},
                {"hi", [{1, 3}, {5, 5}]}],
    Expected = prettify(Input),
    all_tests_passed.

% Replaces the line-number list of every index entry with the value
% returned by runs:runs/1 for that list.
% Is: list of {Word, LineNums} entries.
% The result lists the entries in reverse order of Is.
prettify(Is) ->
    prettify(Is, []).

% Worker for prettify/1; Pretty accumulates the converted entries.
prettify([{Word, LineNums} | Rest], Pretty) ->
    Runs = runs:runs(LineNums),
    prettify(Rest, [{Word, Runs} | Pretty]);
prettify([], Pretty) ->
    Pretty.  % Left unreversed: the index has no meaningful order.


%----------

% Checks index_lines/1 on no lines, one line, a repeated line, and two
% different lines.
index_lines_test() ->
    [] = index_lines([]),
    OneLine = index_lines(["hi bye"]),
    [{"hi", [1]}, {"bye", [1]}] = OneLine,
    SameLineTwice = index_lines(["hi bye", "hi bye"]),
    [{"hi", [2, 1]}, {"bye", [2, 1]}] = SameLineTwice,
    % The entry order is a by-product of how find_index/2 reshuffles the
    % index on every lookup; it carries no meaning.
    TwoLines = index_lines(["hi bye", "abc def"]),
    [{"abc", [2]}, {"hi", [1]}, {"bye", [1]}, {"def", [2]}] = TwoLines,
    all_tests_passed.

% Builds the index for a list of normalized lines.
% Ls: list of strings; the first line is line number 1.
% Returns a list of {Word, LineNums} entries, where LineNums holds the
% numbers of the lines containing Word, most recent first.
index_lines(Ls) ->
    index_lines(Ls, [], 1).

% Worker for index_lines/1: Is is the index built so far and LineNum is
% the number of the line at the head of the list.
index_lines([], Is, _LineNum) ->
    Is;
index_lines([Line | Rest], Is, LineNum) ->
    NextIs = case Line of
                 "" -> Is;  % A blank line adds nothing but still counts.
                 _ -> index_words(split(Line), Is, LineNum)
             end,
    index_lines(Rest, NextIs, LineNum + 1).

%------------

% Checks index_words/3, called as index_words(Words, Indexes, LineNum).
index_words_test() ->
    % No words: the index is returned as it was.
    [] = index_words([], [], 1),
    % A new word gets an entry holding only the current line.
    [{"abc", [1]}] = index_words(["abc"], [], 1),
    % The most recently handled word ends up at the front.
    TwoNewWords = index_words(["abc", "def"], [], 2),
    [{"def", [2]}, {"abc", [2]}] = TwoNewWords,
    % Entries for other words are kept unchanged.
    WithOther = index_words(["abc"], [{"def", [1, 2]}], 2),
    [{"abc", [2]}, {"def", [1, 2]}] = WithOther,
    all_tests_passed.

% Records that every word in the first argument occurs on line LineNum.
% Is: the index so far, a list of {Word, LineNums} entries.
% Returns the updated index; the entry of the word handled last is at
% the head.
index_words([], Is, _LineNum) ->
    Is;
index_words([W | Ws], Is, LineNum) ->
    % find_index/2 hands back a default {W, []} entry when W is new.
    {{_Key, SeenLines}, Others} = find_index(W, Is),
    Updated = {W, [LineNum | SeenLines]},
    index_words(Ws, [Updated | Others], LineNum).

%---------

% Checks find_index/2: the matching entry is pulled out and the other
% entries come back in reverse order.
find_index_test() ->
    {{"pear", []}, []} = find_index("pear", [{"pear", []}]),
    {{"pear", [1, 2, 3]}, []} = find_index("pear", [{"pear", [1, 2, 3]}]),

    Index = [{"apple", [3, 1]}, {"pear", [4, 2]}, {"beet", [4, 1]}],
    Others = [{"beet", [4, 1]}, {"apple", [3, 1]}],
    {{"pear", [4, 2]}, Others} = find_index("pear", Index),
    all_tests_passed.

% Looks up the entry for word W in the index Is.
% Returns {Entry, Others}: Entry is the {W, LineNums} entry found in Is,
% or the default {W, []} when W is not indexed yet; Others holds every
% remaining entry of Is in reverse order.
find_index(W, Is) ->
    find_index(W, Is, [], {W, []}).

% Worker for find_index/2: Skipped collects the non-matching entries and
% Hit is the best answer so far (initially the default entry).
find_index(_W, [], Skipped, Hit) ->
    {Hit, Skipped};
find_index(W, [Entry | Rest], Skipped, Hit) ->
    case Entry of
        {W, _} ->
            % Match: remember it and leave it out of Skipped.
            find_index(W, Rest, Skipped, Entry);
        _ ->
            find_index(W, Rest, [Entry | Skipped], Hit)
    end.

%------------

% Checks split/1. Words come back in reverse order of appearance.
split_test() ->
    [] = split(""),                     % An empty line has no words.
    [] = split("   "),                  % Neither does a line of spaces.
    ["ab"] = split("ab"),
    ["ab"] = split(" ab"),              % Leading spaces are skipped.
    ["cd", "ab"] = split("ab cd"),
    ["aa", "aa"] = split("aa aa "),     % A trailing space adds no empty word.
    ["def", "abc"] = split("abc def"),
    ["def", "abc"] = split("abc  def"), % Two spaces, as left behind when
                                        % "abc -- def" is normalized.
    all_tests_passed.

% Splits a line into its space-separated words.
% Line: a string; only the space character (32) separates words.
% Returns the words in reverse order of appearance. Runs of spaces, and
% leading or trailing spaces, never produce empty words.
split(Line) ->
    split(Line, [], []).

% Worker for split/1.
%   1st argument: the characters still to be read.
%   Word:  the word being read, accumulated in reverse.
%   Words: the completed words, most recent first.
split([], [], Words) ->
    % End of line with no word in progress (for example after a trailing
    % space): do not add an empty string.
    Words;
split([], Word, Words) ->
    % End of line: whatever Word holds is the final word.
    [reverse(Word) | Words];
split([$\s | Rest], [], Words) ->
    % A space with no word in progress, such as the second of the two
    % spaces left when "consecrate -- we" is normalized. Skipping it keeps
    % " we" from being indexed separately from "we".
    split(Rest, [], Words);
split([$\s | Rest], Word, Words) ->
    % A space ends the current word: store it and start a new one.
    split(Rest, [], [reverse(Word) | Words]);
split([Char | Rest], Word, Words) ->
    split(Rest, [Char | Word], Words).

%------------

% Checks norm_lines/1: each line is normalized and line order is kept.
norm_lines_test() ->
    ["ab"] = norm_lines(["Ab!"]),

    % Dropping "--" leaves both surrounding spaces behind, so the result
    % has two spaces; split/1 copes with that.
    Dashed = norm_lines(["Ab", "", "us -- that"]),
    ["ab", "", "us  that"] = Dashed,

    Punctuated = norm_lines(["Hello, world!", "Goodbye Mars.", ""]),
    ["hello world", "goodbye mars", ""] = Punctuated,

    all_tests_passed.

% Applies normalize/1 to every line in Lines.
% Returns the normalized lines in their original order.
norm_lines(Lines) ->
    norm_lines(Lines, []).

% Worker for norm_lines/1; Done holds the normalized lines in reverse.
norm_lines([Line | Rest], Done) ->
    norm_lines(Rest, [normalize(Line) | Done]);
norm_lines([], Done) ->
    reverse(Done).

%-------------

% Checks reverse/1 on an empty, a two-element and a three-element list.
reverse_test() ->
    Empty = reverse([]),
    [] = Empty,
    [2, 1] = reverse([1, 2]),
    [1, 2, 3] = reverse([3, 2, 1]),
    all_tests_passed.

% Returns the elements of list L in reverse order.
reverse(L) ->
    reverse(L, []).

% reverse(List, Reversed): moves the elements of List one by one onto the
% front of Reversed and returns the result.
reverse([Head | Tail], Reversed) ->
    reverse(Tail, [Head | Reversed]);
reverse([], Reversed) ->
    Reversed.

%-----------------

% Checks normalize/1: letters are lower-cased, spaces are kept, and every
% other character is removed.
normalize_test() ->
    % Nothing to change.
    "" = normalize(""),
    "a" = normalize("a"),
    "ab" = normalize("ab"),
    % Upper case becomes lower case.
    "ab" = normalize("aB"),
    "ab" = normalize("AB"),
    % Digits and punctuation disappear.
    "ab" = normalize("1A+B&2"),
    "ab" = normalize("{%A|B.@}"),
    "ab" = normalize("?!*a';(<}B{>)-+"),
    % Spaces survive.
    "a b" = normalize("A b!"),
    all_tests_passed.


% Normalizes one line of text for indexing.
% L: a string. Letters A-Z are converted to lower case; letters a-z and
% spaces are kept; every other character is dropped.
normalize(L) ->
    normalize(L, []).

% Worker for normalize/1; Acc holds the kept characters in reverse.
normalize([], Acc) ->
    reverse(Acc);
normalize([C | Rest], Acc) ->
    if
        C >= $A, C =< $Z ->
            % Upper and lower case are 32 apart in ASCII.
            normalize(Rest, [C + 32 | Acc]);
        C >= $a, C =< $z ->
            normalize(Rest, [C | Acc]);
        C =:= 32 ->
            % Keep spaces: split/1 uses them as word separators.
            normalize(Rest, [C | Acc]);
        true ->
            normalize(Rest, Acc)
    end.
	-module(aw2).
	-compile(export_all).
	-include_lib("eunit/include/eunit.hrl").

	% I put the code for converting the line numbers into runs in another module
	% called runs, and I call the function runs:runs() from this module.

	% Variable names:
	% Is => Indexes, I => Index
	% I => {"word", [1, 3, 7, 9]}
	% Ws => Words
	% W => Word

	% Prints the word index of the file named FileName.
	% The file's lines are fetched with index:get_file_contents/1,
	% normalized, indexed, prettified and printed with io:format/2.
	index_file(FileName) ->
	    Contents = index:get_file_contents(FileName),
	    Normalized = norm_lines(Contents),
	    Pretty = prettify(index_lines(Normalized)),
	    io:format("~p~n", [Pretty]).

	%------------

	% Checks prettify/1: line numbers become runs and the entries come
	% back in reverse order of the input.
	prettify_test() ->
	    Index = [{"hi", [3, 5, 1, 2]}, {"bye", [3, 4, 3, 1, 2, 8, 7, 10]}],
	    Pretty = prettify(Index),
	    [{"bye", [{1, 4}, {7, 8}, {10, 10}]},
	     {"hi", [{1, 3}, {5, 5}]}] = Pretty,
	    all_tests_passed.

	% Replaces the line-number list of every index entry with the value
	% returned by runs:runs/1 for that list.
	% Is: list of {Word, LineNums} entries.
	% The result lists the entries in reverse order of Is.
	prettify(Is) ->
	    prettify(Is, []).

	% Worker for prettify/1; Acc accumulates the converted entries.
	prettify([], Acc) ->
	    Acc;  % No need to reverse: the entries are not ordered anyway.
	prettify([{Word, LineNums} | Is], Acc) ->
	    prettify(Is, [{Word, runs:runs(LineNums)} | Acc]).


	%----------

	% Checks index_lines/1 on no lines, one line, a repeated line, and
	% two different lines.
	index_lines_test() ->
	    [] = index_lines([]),
	    Single = index_lines(["hi bye"]),
	    [{"hi", [1]}, {"bye", [1]}] = Single,
	    Repeated = index_lines(["hi bye", "hi bye"]),
	    [{"hi", [2, 1]}, {"bye", [2, 1]}] = Repeated,
	    % The entry order is a side effect of the lookup algorithm and
	    % carries no meaning.
	    Mixed = index_lines(["hi bye", "abc def"]),
	    [{"abc", [2]}, {"hi", [1]}, {"bye", [1]}, {"def", [2]}] = Mixed,
	    all_tests_passed.

	% Builds the index for a list of normalized lines.
	% Ls: list of strings; the first line is line number 1.
	% Returns a list of {Word, LineNums} entries, where LineNums holds
	% the numbers of the lines containing Word, most recent first.
	index_lines(Ls) ->
	    index_lines(Ls, [], 1).

	% Worker for index_lines/1: Is is the index built so far and LineNum
	% is the number of the line at the head of the list.
	index_lines([], Is, _) ->
	    Is;
	index_lines(["" | Ls], Is, LineNum) ->
	    % A blank line has no words but still advances the line number.
	    index_lines(Ls, Is, LineNum + 1);
	index_lines([L | Ls], Is, LineNum) ->
	    Words = split(L),
	    NewIs = index_words(Words, Is, LineNum),
	    index_lines(Ls, NewIs, LineNum + 1).

	%------------

	% Checks index_words/3, called as index_words(Words, Indexes, LineNum).
	index_words_test() ->
	    % No words: the index comes back untouched.
	    [] = index_words([], [], 1),
	    % A new word gets an entry holding only the current line.
	    [{"abc", [1]}] = index_words(["abc"], [], 1),
	    % The word handled last sits at the front.
	    Fresh = index_words(["abc", "def"], [], 2),
	    [{"def", [2]}, {"abc", [2]}] = Fresh,
	    % Entries of other words are preserved.
	    Kept = index_words(["abc"], [{"def", [1, 2]}], 2),
	    [{"abc", [2]}, {"def", [1, 2]}] = Kept,
	    all_tests_passed.

	% Records that every word in the first argument occurs on line LineNum.
	% Is: the index so far, a list of {Word, LineNums} entries.
	% Returns the updated index; the entry of the word handled last is
	% at the head.
	index_words([], Is, _) ->
	    Is;
	index_words([W | Ws], Is, LineNum) ->
	    % When W is not in Is, find_index/2 returns the default {W, []}.
	    {I, OtherIs} = find_index(W, Is),
	    {_, LineNums} = I,
	    NewI = {W, [LineNum | LineNums]},
	    % Put the updated entry back together with the other entries.
	    index_words(Ws, [NewI | OtherIs], LineNum).

	%---------

	% Checks find_index/2: the matching entry is extracted and the
	% remaining entries come back in reverse order.
	find_index_test() ->
	    {{"pear", []}, []} = find_index("pear", [{"pear", []}]),
	    {{"pear", [1, 2, 3]}, []} = find_index("pear", [{"pear", [1, 2, 3]}]),

	    Entries = [{"apple", [3, 1]}, {"pear", [4, 2]}, {"beet", [4, 1]}],
	    Found = find_index("pear", Entries),
	    {{"pear", [4, 2]}, [{"beet", [4, 1]}, {"apple", [3, 1]}]} = Found,
	    all_tests_passed.

	% Looks up the entry for word W in the index Is.
	% Returns {Entry, Others}: Entry is the {W, LineNums} entry found in
	% Is, or the default {W, []} when W is not indexed yet; Others holds
	% every remaining entry of Is in reverse order.
	find_index(W, Is) ->
	    find_index(W, Is, [], {W, []}).

	% Worker for find_index/2: OtherIs collects the non-matching entries
	% and FoundI is the answer so far (initially the default entry).
	find_index(_, [], OtherIs, FoundI) ->
	    {FoundI, OtherIs};
	find_index(W, [{W, _} = I | Is], OtherIs, _) ->
	    % W matches this entry: keep it as FoundI, leave it out of OtherIs.
	    find_index(W, Is, OtherIs, I);
	find_index(W, [I | Is], OtherIs, FoundI) ->
	    find_index(W, Is, [I | OtherIs], FoundI).

	%------------

	% Checks split/1. Words come back in reverse order of appearance.
	split_test() ->
	    ["ab"] = split("ab"),
	    ["cd", "ab"] = split("ab cd"),
	    ["aa", "aa"] = split("aa aa "),     % A trailing space adds no empty word.
	    ["def", "abc"] = split("abc def"),
	    ["def", "abc"] = split("abc  def"), % Two spaces, as left behind when
	                                        % "abc -- def" is normalized.
	    all_tests_passed.

	% Splits a line into its space-separated words.
	% Line: a string; only the space character (32) separates words.
	% Returns the words in reverse order of appearance. Runs of spaces,
	% and leading or trailing spaces, never produce empty words.
	split(Line) ->
	    split(Line, [], []).

	% Worker for split/1.
	%   1st argument: the characters still to be read.
	%   Word:  the word being read, accumulated in reverse.
	%   Words: the completed words, most recent first.
	split([], [], Words) ->
	    % End of line with no word in progress (for example after a
	    % trailing space): do not add an empty string.
	    Words;
	split([], Word, Words) ->
	    % End of line: whatever Word holds is the final word.
	    [reverse(Word) | Words];
	split([X | Xs], [], Words) when X =:= 32 ->
	    % A space with no word in progress, such as the second of the two
	    % spaces left when "consecrate -- we" is normalized. Skipping it
	    % keeps " we" from being indexed separately from "we".
	    split(Xs, [], Words);
	split([X | Xs], Word, Words) when X =:= 32 ->
	    % A space ends the current word: store it and start a new one.
	    split(Xs, [], [reverse(Word) | Words]);
	split([X | Xs], Word, Words) ->
	    split(Xs, [X | Word], Words).

	%------------

	% Checks norm_lines/1: each line is normalized and line order is kept.
	norm_lines_test() ->
	    ["ab"] = norm_lines(["Ab!"]),

	    % normalize/1 drops "--" but keeps both surrounding spaces, so the
	    % expected string has TWO spaces between "us" and "that".
	    ["ab", "", "us  that"] = norm_lines(
	        ["Ab", "", "us -- that"]),

	    ["hello world", "goodbye mars", ""] = norm_lines(
	        ["Hello, world!", "Goodbye Mars.", ""]),

	    all_tests_passed.

	% Applies normalize/1 to every line in Lines.
	% Returns the normalized lines in their original order.
	norm_lines(Lines) ->
	    norm_lines(Lines, []).

	% Worker for norm_lines/1; Acc holds the normalized lines in reverse.
	norm_lines([], Acc) ->
	    reverse(Acc);
	norm_lines([L | Ls], Acc) ->
	    norm_lines(Ls, [normalize(L) | Acc]).

	%-------------

	% Checks reverse/1 on an empty, a two-element and a three-element list.
	reverse_test() ->
	    NoElements = reverse([]),
	    [] = NoElements,
	    [2, 1] = reverse([1, 2]),
	    [1, 2, 3] = reverse([3, 2, 1]),
	    all_tests_passed.

	% Returns the elements of list L in reverse order.
	reverse(L) -> reverse(L, []).

	% reverse(List, Acc): moves the elements of List one by one onto the
	% front of Acc and returns the result.
	reverse([], Acc) ->
	    Acc;
	reverse([X | Xs], Acc) ->
	    reverse(Xs, [X | Acc]).

	%-----------------

	% Checks normalize/1: letters are lower-cased, spaces are kept, and
	% every other character is removed.
	normalize_test() ->
	    "" = normalize(""),
	    "a" = normalize("a"),
	    "ab" = normalize("ab"),
	    "ab" = normalize("aB"),
	    "ab" = normalize("AB"),
	    "ab" = normalize("1A+B&2"),
	    "ab" = normalize("{%A|B.@}"),
	    "ab" = normalize("?!*a';(<}B{>)-+"),
	    "a b" = normalize("A b!"),
	    all_tests_passed.


	% Normalizes one line of text for indexing.
	% L: a string. Letters A-Z are converted to lower case; letters a-z
	% and spaces are kept; every other character is dropped.
	normalize(L) ->
	    normalize(L, []).

	% Worker for normalize/1; Acc holds the kept characters in reverse.
	normalize([], Acc) ->
	    reverse(Acc);
	normalize([X | Xs], Acc) when X >= $A, X =< $Z ->
	    % Upper and lower case are 32 apart in ASCII.
	    normalize(Xs, [X + 32 | Acc]);
	normalize([X | Xs], Acc) when X >= $a, X =< $z; X =:= 32 ->
	    % Lower-case letters and spaces are kept as they are.
	    normalize(Xs, [X | Acc]);
	normalize([_ | Xs], Acc) ->
	    % Anything else is skipped.
	    normalize(Xs, Acc).