mjstrasser/index.erl

## index.erl
-module(index).
-export([get_file_contents/1,show_file_contents/1,
        index_file/1, word_index/1]).
-include_lib("eunit/include/eunit.hrl").


%% Index the words in the specified file.
index_file(Filename) ->
  word_index(get_file_contents(Filename)).

%% Construct an index with the lines that each word appears in the supplied
%% text as a list of lines of text.
word_index(Lines) ->
  % RawIndex has entries for every word occurrence with duplicates
  % in reverse line order.
  RawIndex = index_lines(Lines, 1, []),
  % Consolidate the line numbers in the raw index.
  consolidate_index(RawIndex).

% Simple test.
word_index_test() ->
  ?assertEqual([
      {"this", [{1,2}]},
      {"is", [{1,3}]},
      {"fun", [{1}]},
      {"clever", [{2}]},
      {"that", [{3}]},
      {"good", [{3}]}
    ],
    word_index([
      "this is fun",
      "this is clever",
      "that is good",
      ""
    ])
  ).

%% Index the words in a list of lines of text.
index_lines([], _, Index) -> Index;
index_lines([Line|Rem], N, Index) ->
  Words = split_words(Line),
  index_lines(Rem, N+1, index_words(Words, N, Index)).

%% Index the words for a line of text line with supplied number.
%% The generated raw
index_words([], _, Index) -> Index;
index_words([Word|Rem], N, Index) ->
  index_words(Rem, N, add_word(Word, N, Index)).

index_words_test() ->
  ?assertEqual([], index_words([], 1, [])),
  Idx1 = [{"one",[{1}]}],
  ?assertEqual(Idx1, index_words(["one"], 1, [])),
  Idx2 = [{"one",[{2},{1}]},{"two",[{2}]}],
  ?assertEqual(Idx2, index_words(["one","two"], 2, Idx1)),
  ?assertEqual(Idx2, index_words([], 3, Idx2)),
  Idx4 = [{"one",[{4},{2},{1}]},{"two",[{2}]},{"four",[{4}]}],
  ?assertEqual(Idx4, index_words(["one","four"], 4, Idx2)),
  Idx5 = [{"one",[{5},{5},{4},{2},{1}]},{"two",[{5},{2}]},{"four",[{4}]}],
  ?assertEqual(Idx5, index_words(["one","two","one"], 5, Idx4)).

%% Add a word to the index. This function is not tail-recursive but that will
%% be OK unless there are lines containing very many words.
%%
%%   Index is empty: start with this word.
add_word(Word, N, []) -> [{Word, [{N}]}];
%%   Index contains this word: add line number.
add_word(Word, N, [{Word, Entries} | Rem]) ->
  [{Word, [{N} | Entries]} | Rem];
%%   Keep looking for this word in the index.
add_word(Word, N, [Idx|Rem]) -> [Idx | add_word(Word, N, Rem)].

add_word_test() ->
  Idx1 = [{"one",[{1}]}],
  ?assertEqual(Idx1, add_word("one", 1, [])),
  Idx2 = [{"one",[{2},{1}]}],
  ?assertEqual(Idx2, add_word("one", 2, Idx1)),
  Idx3 = [{"one",[{2},{1}]},{"two",[{2}]}],
  ?assertEqual(Idx3, add_word("two", 2, Idx2)).

%% Is the character a word character: A-Z or a-z.
%% (Simple definition for now.)
word_char(C) -> C >= $A andalso C =< $Z orelse C >= $a andalso C =< $z.

word_char_test() ->
  ?assert(word_char($A)),
  ?assert(word_char($K)),
  ?assert(word_char($z)),
  ?assertNot(word_char($/)),
  ?assertNot(word_char($-)),
  ?assertNot(word_char($,)).

%% Split the next word from the line, returning it and the remainder
%% as a 2-tuple.
next_word(Line) -> next_word(Line, []).
next_word([], Word) -> {lists:reverse(Word), ""};
next_word([Char|Chars], Word) ->
  case word_char(Char) of
    true -> next_word(Chars, [Char|Word]);
    false -> {lists:reverse(Word), Chars}
  end.

next_word_test() ->
  ?assertEqual({"", ""}, next_word("")),
  ?assertEqual({"solo", ""}, next_word("solo")),
  ?assertEqual({"Rhubarb", "is tasty"}, next_word("Rhubarb is tasty")).

%% Filter a list, returning only non-empty items.
non_empty(Xs) -> lists:reverse(non_empty(Xs, [])).
non_empty([], NonEmpty) -> NonEmpty;
non_empty([[]|Xs], NonEmpty) -> non_empty(Xs, NonEmpty);
non_empty([X|Xs], NonEmpty) -> non_empty(Xs, [X|NonEmpty]).

non_empty_test() ->
  ?assertEqual([], non_empty([[], []])),
  ?assertEqual(["hello", "there"], non_empty([[], "hello", [], [], "there"])).

%% Split a line of characters into a list of words.
split_words(Line) -> lists:reverse(non_empty(split_words(Line, []))).
split_words([], Words) -> Words;
split_words(Line, Words) ->
  {First, Rem} = next_word(Line),
  split_words(Rem, [First|Words]).

split_words_test() ->
  ?assertEqual([], split_words([])),
  ?assertEqual(["this", "and", "that", "black", "white"],
               split_words("this and that: black -- white!")).

%% Consolidate a raw index containing words with lists of line numbers
%% in reverse order. An example is:
%%   [{"one", [{2},{1}]}, {"two", [{3},{1}]}]
consolidate_index(RawIndex) ->
  lists:reverse(consolidate_index(RawIndex, [])).
consolidate_index([], Index) -> Index;
consolidate_index([{Word,DescNos}|Rem], Index) ->
  AscNos = lists:reverse(DescNos),
  consolidate_index(Rem, [{Word, consolidate_nums(AscNos)} | Index]).

consolidate_index_test() ->
  ?assertEqual([], consolidate_index([], [])),
  ?assertEqual([{"one",[{1}]}], consolidate_index([{"one",[{1}]}], [])),
  ?assertEqual([{"one",[{1,2}]},{"two",[{2}]}],
                consolidate_index([{"one",[{2},{1}]},{"two",[{2}]}])).

%% Consolidate a sorted list of 1-tuples of numbers:
%% - remove duplicates, thus {1},{1} -> {1}
%% - reduce consecutive numbers to 2-tuples with first and last numbers,
%%   thus {1},{2},{3},{4} -> {1,4}
%% - otherwise keep 1-tuples as they are.
consolidate_nums(Nums) ->
  lists:reverse(consolidate_nums(Nums, [])).

consolidate_nums([], Accum) ->
  Accum;
consolidate_nums([{N}], Accum) ->
  [{N} | Accum];
consolidate_nums([{N},{N}|Rem], Accum) ->
  consolidate_nums([{N}|Rem], Accum);
consolidate_nums([{N},{O}|Rem], Accum) when O == N+1 ->
  consolidate_nums([{N,O}|Rem], Accum);
consolidate_nums([{N},{O}|Rem], Accum) ->
  consolidate_nums([{O}|Rem], [{N} | Accum]);
consolidate_nums([{N,O},{O}|Rem], Accum) ->
  consolidate_nums([{N,O}|Rem], Accum);
consolidate_nums([{N,O},{P}|Rem], Accum) when P == O+1 ->
  consolidate_nums([{N,P}|Rem], Accum);
consolidate_nums([{N,O},{P}|Rem], Accum) ->
  consolidate_nums([{P}|Rem], [{N,O} | Accum]);
consolidate_nums([Nums|Rem], Accum) ->
  consolidate_nums(Rem, [Nums | Accum]).

consolidate_nums_test() ->
  ?assertEqual([], consolidate_nums([])),
  ?assertEqual([{1}], consolidate_nums([{1}])),
  ?assertEqual([{1}], consolidate_nums([{1},{1}])),
  ?assertEqual([{1},{3}], consolidate_nums([{1},{3}])),
  ?assertEqual([{1,2}], consolidate_nums([{1},{2}])),
  ?assertEqual([{1,2}], consolidate_nums([{1},{2},{2}])),
  ?assertEqual([{1,4}], consolidate_nums([{1},{2},{2},{3},{4},{4}])),
  ?assertEqual([{1,3},{5}], consolidate_nums([{1},{2},{3},{5}])),
  ?assertEqual([{1,3},{5},{7,9},{12}],
               consolidate_nums([{1},{2},{3},{5},{7},{8},{9},{12}])),
  % Numbers must be consecutive!
  ?assertEqual([{1},{3},{2}], consolidate_nums([{1},{3},{2}])).

%%%%%%%%%%%%%%%%% Supplied utility functions %%%%%%%%%%%%%%%

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.

get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
    lists:reverse(Rev).

% Auxiliary function for get_file_contents.
% Not exported.

get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.

show_file_contents([L|Ls]) ->
    io:format("~s~n",[L]),
    show_file_contents(Ls);
show_file_contents([]) ->
    ok.
	-module(index).
	-export([get_file_contents/1,show_file_contents/1,
	index_file/1, word_index/1]).
	-include_lib("eunit/include/eunit.hrl").


	%% Index the words in the specified file.
	index_file(Filename) ->
	word_index(get_file_contents(Filename)).

	%% Construct an index with the lines that each word appears in the supplied
	%% text as a list of lines of text.
	word_index(Lines) ->
	% RawIndex has entries for every word occurrence with duplicates
	% in reverse line order.
	RawIndex = index_lines(Lines, 1, []),
	% Consolidate the line numbers in the raw index.
	consolidate_index(RawIndex).

	% Simple test.
	word_index_test() ->
	?assertEqual([
	{"this", [{1,2}]},
	{"is", [{1,3}]},
	{"fun", [{1}]},
	{"clever", [{2}]},
	{"that", [{3}]},
	{"good", [{3}]}
	],
	word_index([
	"this is fun",
	"this is clever",
	"that is good",
	""
	])
	).

	%% Index the words in a list of lines of text.
	index_lines([], _, Index) -> Index;
	index_lines([Line\|Rem], N, Index) ->
	Words = split_words(Line),
	index_lines(Rem, N+1, index_words(Words, N, Index)).

	%% Index the words for a line of text line with supplied number.
	%% The generated raw
	index_words([], _, Index) -> Index;
	index_words([Word\|Rem], N, Index) ->
	index_words(Rem, N, add_word(Word, N, Index)).

	index_words_test() ->
	?assertEqual([], index_words([], 1, [])),
	Idx1 = [{"one",[{1}]}],
	?assertEqual(Idx1, index_words(["one"], 1, [])),
	Idx2 = [{"one",[{2},{1}]},{"two",[{2}]}],
	?assertEqual(Idx2, index_words(["one","two"], 2, Idx1)),
	?assertEqual(Idx2, index_words([], 3, Idx2)),
	Idx4 = [{"one",[{4},{2},{1}]},{"two",[{2}]},{"four",[{4}]}],
	?assertEqual(Idx4, index_words(["one","four"], 4, Idx2)),
	Idx5 = [{"one",[{5},{5},{4},{2},{1}]},{"two",[{5},{2}]},{"four",[{4}]}],
	?assertEqual(Idx5, index_words(["one","two","one"], 5, Idx4)).

	%% Add a word to the index. This function is not tail-recursive but that will
	%% be OK unless there are lines containing very many words.
	%%
	%% Index is empty: start with this word.
	add_word(Word, N, []) -> [{Word, [{N}]}];
	%% Index contains this word: add line number.
	add_word(Word, N, [{Word, Entries} \| Rem]) ->
	[{Word, [{N} \| Entries]} \| Rem];
	%% Keep looking for this word in the index.
	add_word(Word, N, [Idx\|Rem]) -> [Idx \| add_word(Word, N, Rem)].

	add_word_test() ->
	Idx1 = [{"one",[{1}]}],
	?assertEqual(Idx1, add_word("one", 1, [])),
	Idx2 = [{"one",[{2},{1}]}],
	?assertEqual(Idx2, add_word("one", 2, Idx1)),
	Idx3 = [{"one",[{2},{1}]},{"two",[{2}]}],
	?assertEqual(Idx3, add_word("two", 2, Idx2)).

	%% Is the character a word character: A-Z or a-z.
	%% (Simple definition for now.)
	word_char(C) -> C >= $A andalso C =< $Z orelse C >= $a andalso C =< $z.

	word_char_test() ->
	?assert(word_char($A)),
	?assert(word_char($K)),
	?assert(word_char($z)),
	?assertNot(word_char($/)),
	?assertNot(word_char($-)),
	?assertNot(word_char($,)).

	%% Split the next word from the line, returning it and the remainder
	%% as a 2-tuple.
	next_word(Line) -> next_word(Line, []).
	next_word([], Word) -> {lists:reverse(Word), ""};
	next_word([Char\|Chars], Word) ->
	case word_char(Char) of
	true -> next_word(Chars, [Char\|Word]);
	false -> {lists:reverse(Word), Chars}
	end.

	next_word_test() ->
	?assertEqual({"", ""}, next_word("")),
	?assertEqual({"solo", ""}, next_word("solo")),
	?assertEqual({"Rhubarb", "is tasty"}, next_word("Rhubarb is tasty")).

	%% Filter a list, returning only non-empty items.
	non_empty(Xs) -> lists:reverse(non_empty(Xs, [])).
	non_empty([], NonEmpty) -> NonEmpty;
	non_empty([[]\|Xs], NonEmpty) -> non_empty(Xs, NonEmpty);
	non_empty([X\|Xs], NonEmpty) -> non_empty(Xs, [X\|NonEmpty]).

	non_empty_test() ->
	?assertEqual([], non_empty([[], []])),
	?assertEqual(["hello", "there"], non_empty([[], "hello", [], [], "there"])).

	%% Split a line of characters into a list of words.
	split_words(Line) -> lists:reverse(non_empty(split_words(Line, []))).
	split_words([], Words) -> Words;
	split_words(Line, Words) ->
	{First, Rem} = next_word(Line),
	split_words(Rem, [First\|Words]).

	split_words_test() ->
	?assertEqual([], split_words([])),
	?assertEqual(["this", "and", "that", "black", "white"],
	split_words("this and that: black -- white!")).

	%% Consolidate a raw index containing words with lists of line numbers
	%% in reverse order. An example is:
	%% [{"one", [{2},{1}]}, {"two", [{3},{1}]}]
	consolidate_index(RawIndex) ->
	lists:reverse(consolidate_index(RawIndex, [])).
	consolidate_index([], Index) -> Index;
	consolidate_index([{Word,DescNos}\|Rem], Index) ->
	AscNos = lists:reverse(DescNos),
	consolidate_index(Rem, [{Word, consolidate_nums(AscNos)} \| Index]).

	consolidate_index_test() ->
	?assertEqual([], consolidate_index([], [])),
	?assertEqual([{"one",[{1}]}], consolidate_index([{"one",[{1}]}], [])),
	?assertEqual([{"one",[{1,2}]},{"two",[{2}]}],
	consolidate_index([{"one",[{2},{1}]},{"two",[{2}]}])).

	%% Consolidate a sorted list of 1-tuples of numbers:
	%% - remove duplicates, thus {1},{1} -> {1}
	%% - reduce consecutive numbers to 2-tuples with first and last numbers,
	%% thus {1},{2},{3},{4} -> {1,4}
	%% - otherwise keep 1-tuples as they are.
	consolidate_nums(Nums) ->
	lists:reverse(consolidate_nums(Nums, [])).

	consolidate_nums([], Accum) ->
	Accum;
	consolidate_nums([{N}], Accum) ->
	[{N} \| Accum];
	consolidate_nums([{N},{N}\|Rem], Accum) ->
	consolidate_nums([{N}\|Rem], Accum);
	consolidate_nums([{N},{O}\|Rem], Accum) when O == N+1 ->
	consolidate_nums([{N,O}\|Rem], Accum);
	consolidate_nums([{N},{O}\|Rem], Accum) ->
	consolidate_nums([{O}\|Rem], [{N} \| Accum]);
	consolidate_nums([{N,O},{O}\|Rem], Accum) ->
	consolidate_nums([{N,O}\|Rem], Accum);
	consolidate_nums([{N,O},{P}\|Rem], Accum) when P == O+1 ->
	consolidate_nums([{N,P}\|Rem], Accum);
	consolidate_nums([{N,O},{P}\|Rem], Accum) ->
	consolidate_nums([{P}\|Rem], [{N,O} \| Accum]);
	consolidate_nums([Nums\|Rem], Accum) ->
	consolidate_nums(Rem, [Nums \| Accum]).

	consolidate_nums_test() ->
	?assertEqual([], consolidate_nums([])),
	?assertEqual([{1}], consolidate_nums([{1}])),
	?assertEqual([{1}], consolidate_nums([{1},{1}])),
	?assertEqual([{1},{3}], consolidate_nums([{1},{3}])),
	?assertEqual([{1,2}], consolidate_nums([{1},{2}])),
	?assertEqual([{1,2}], consolidate_nums([{1},{2},{2}])),
	?assertEqual([{1,4}], consolidate_nums([{1},{2},{2},{3},{4},{4}])),
	?assertEqual([{1,3},{5}], consolidate_nums([{1},{2},{3},{5}])),
	?assertEqual([{1,3},{5},{7,9},{12}],
	consolidate_nums([{1},{2},{3},{5},{7},{8},{9},{12}])),
	% Numbers must be consecutive!
	?assertEqual([{1},{3},{2}], consolidate_nums([{1},{3},{2}])).

	%%%%%%%%%%%%%%%%% Supplied utility functions %%%%%%%%%%%%%%%

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	get_file_contents(Name) ->
	{ok,File} = file:open(Name,[read]),
	Rev = get_all_lines(File,[]),
	lists:reverse(Rev).

	% Auxiliary function for get_file_contents.
	% Not exported.

	get_all_lines(File,Partial) ->
	case io:get_line(File,"") of
	eof -> file:close(File),
	Partial;
	Line -> {Strip,_} = lists:split(length(Line)-1,Line),
	get_all_lines(File,[Strip\|Partial])
	end.

	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.

	show_file_contents([L\|Ls]) ->
	io:format("~s~n",[L]),
	show_file_contents(Ls);
	show_file_contents([]) ->
	ok.