NeilMadden/index.erl

## index.erl
-module(index).
-export([index_file/1,get_file_contents/1,show_file_contents/1]).

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.

get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
lists:reverse(Rev).

% Auxiliary function for get_file_contents.
% Not exported.

get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.

show_file_contents([L|Ls]) ->
    io:format("~s~n",[L]),
    show_file_contents(Ls);
 show_file_contents([]) ->
    ok.

%% **** BEGIN SOLUTION ****

% We maintain our index using the form as described in the exercise, i.e.,
%       {Word, [{Start, End}, {Start, End}, ...]}
% Where each {Start, End} pair is a contiguous range of line numbers on which the
% given word occurs. Unlike the exercise, we reverse the order of the list so that
% the head element is always the most recently found line. This makes checking and
% updating the index cheap as we only have to check the head of the list at any
% point. We then

% index_file(Name) --
%
%   Indexes the given file and returns the index as a list of entries of the form
%       {Word, [{Start,End}, {Start,End}, ...]}
%   where each {Start, End} pair is a contiguous range of lines on which the given
%   word occurs.
%
index_file(Name) ->
    lists:keysort(1, reverse_indices(index_lines(get_file_contents(Name), 1, []))).

% reverse_indices(Index) --
%
%   Reverses the order of the occurrences for each word in the index, so that they
%   are in the order in which they occur in the file.
%
reverse_indices([]) ->
    [];
reverse_indices([{Word, Occurrences}|Rest]) ->
    [{Word, lists:reverse(Occurrences)} | reverse_indices(Rest)].

% index_lines(Lines, LineNumber, Index) --
%
%   Indexes the list of Lines given as input, updating the Index with the LineNumbers
%   on which each word occurs. Returns the updated index.
%
index_lines([], _, Index) ->
    Index;
index_lines([Line|Lines], Num, Index) ->
    % Split the line into words by whitespace and punctuation
    Words = string:tokens(Line, " \t,.!;:\"\'[](){}&-<`\\"),
    % Index all words within this line
    NewIndex = index_words(Words, Num, Index),
    % Index the remaining lines
    index_lines(Lines, Num+1, NewIndex).

% index_words(Words, LineNumber, Index) --
%
%   Index all the words in a line.
%
index_words([], _, Index) ->
    Index;
index_words([Word|Words], LineNum, Index) ->
    NormalizedWord = normalize(Word),
    % Find the word in the key list and update it with the new index entry
    NewIndex = case update_entry(NormalizedWord, LineNum,
                            find(NormalizedWord, Index)) of
                   % false if the word was too boring to index
                   false    -> Index;
                   NewEntry -> store(NormalizedWord, Index, NewEntry)
               end,
    % Now update the index with the new entry and index the remaining words
    index_words(Words, LineNum, NewIndex).

find(Word, Index) ->
    lists:keyfind(Word, 1, Index).

store(Word, Index, Entry) ->
    lists:keystore(Word, 1, Index, Entry).

% normalize(Word) --
%
%   Normalizes the given word. Currently just converts to lowercase rather than
%   performing stemming or other transformations.
%
normalize(Word) ->
    string:to_lower(Word).

% update_entry(Word, LineNumber, ExistingEntry | false) --
%
%   Updates an entry in the index to include the given word at the given line number.
%   If the entry does not exist then a new entry is created. If an entry exists that
%   already covers this line number, then we do nothing. If an entry exists that
%   extends up to the previous line, then we extend the range to include this line
%   too. Otherwise, we add a new range containing just this new line number.
%
update_entry(Word, _, Entry) when length(Word) < 3 ->
    Entry;
update_entry(Word, LineNum, false) ->
    {Word, [{LineNum, LineNum}]};
update_entry(Word, LineNum, Entry={Word, [{_,End}|_]}) when End == LineNum ->
    % Entry already covers this line
    Entry;
update_entry(Word, LineNum, {Word, [{Start,End}|Rest]}) when End == LineNum-1 ->
    % There is an existing entry that extends to the previous line, so update it
    {Word, [{Start,LineNum}|Rest]};
update_entry(Word, LineNum, {Word, Rest}) ->
    % This is a new occurrence that is discontiguous with any existing entry
    {Word, [{LineNum,LineNum}|Rest]}.

% Efficiency considerations:
%
%   The current approach just uses an unsorted key/value list to store the index,
%   which requires an O(N) linear scan to both find and update entries. Maintaining
%   a the list in sorted order (e.g., via some balanced binary tree structure) would
%   improve this to O(log N). This would also avoid an additional separate O(N log N)
%   sorting stage at the end.
%
%   Even better would be to use Erlang's new maps (http://erlang.org/eeps/eep-0043.html)
%   that are based on Hash Array Mapped Tries (HAMTs) that provide close to hash-table
%   O(1) lookup and insertion. This would be fairly simple to change, just replacing
%   the find/store procedures to use maps:get/2 and map update syntax and updating the
%   code to pass through an initially empty map. The map key would be the normalized
%   word, while the value would still be a list of occurrences (in most-recent-first order).
	-module(index).
	-export([index_file/1,get_file_contents/1,show_file_contents/1]).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	get_file_contents(Name) ->
	{ok,File} = file:open(Name,[read]),
	Rev = get_all_lines(File,[]),
	lists:reverse(Rev).

	% Auxiliary function for get_file_contents.
	% Not exported.

	get_all_lines(File,Partial) ->
	case io:get_line(File,"") of
	eof -> file:close(File),
	Partial;
	Line -> {Strip,_} = lists:split(length(Line)-1,Line),
	get_all_lines(File,[Strip\|Partial])
	end.

	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.

	show_file_contents([L\|Ls]) ->
	io:format("~s~n",[L]),
	show_file_contents(Ls);
	show_file_contents([]) ->
	ok.

	%% ** BEGIN SOLUTION **

	% We maintain our index using the form as described in the exercise, i.e.,
	% {Word, [{Start, End}, {Start, End}, ...]}
	% Where each {Start, End} pair is a contiguous range of line numbers on which the
	% given word occurs. Unlike the exercise, we reverse the order of the list so that
	% the head element is always the most recently found line. This makes checking and
	% updating the index cheap as we only have to check the head of the list at any
	% point. We then

	% index_file(Name) --
	%
	% Indexes the given file and returns the index as a list of entries of the form
	% {Word, [{Start,End}, {Start,End}, ...]}
	% where each {Start, End} pair is a contiguous range of lines on which the given
	% word occurs.
	%
	index_file(Name) ->
	lists:keysort(1, reverse_indices(index_lines(get_file_contents(Name), 1, []))).

	% reverse_indices(Index) --
	%
	% Reverses the order of the occurrences for each word in the index, so that they
	% are in the order in which they occur in the file.
	%
	reverse_indices([]) ->
	[];
	reverse_indices([{Word, Occurrences}\|Rest]) ->
	[{Word, lists:reverse(Occurrences)} \| reverse_indices(Rest)].

	% index_lines(Lines, LineNumber, Index) --
	%
	% Indexes the list of Lines given as input, updating the Index with the LineNumbers
	% on which each word occurs. Returns the updated index.
	%
	index_lines([], _, Index) ->
	Index;
	index_lines([Line\|Lines], Num, Index) ->
	% Split the line into words by whitespace and punctuation
	Words = string:tokens(Line, " \t,.!;:\"\'[](){}&-<`\\"),
	% Index all words within this line
	NewIndex = index_words(Words, Num, Index),
	% Index the remaining lines
	index_lines(Lines, Num+1, NewIndex).

	% index_words(Words, LineNumber, Index) --
	%
	% Index all the words in a line.
	%
	index_words([], _, Index) ->
	Index;
	index_words([Word\|Words], LineNum, Index) ->
	NormalizedWord = normalize(Word),
	% Find the word in the key list and update it with the new index entry
	NewIndex = case update_entry(NormalizedWord, LineNum,
	find(NormalizedWord, Index)) of
	% false if the word was too boring to index
	false -> Index;
	NewEntry -> store(NormalizedWord, Index, NewEntry)
	end,
	% Now update the index with the new entry and index the remaining words
	index_words(Words, LineNum, NewIndex).

	find(Word, Index) ->
	lists:keyfind(Word, 1, Index).

	store(Word, Index, Entry) ->
	lists:keystore(Word, 1, Index, Entry).

	% normalize(Word) --
	%
	% Normalizes the given word. Currently just converts to lowercase rather than
	% performing stemming or other transformations.
	%
	normalize(Word) ->
	string:to_lower(Word).

	% update_entry(Word, LineNumber, ExistingEntry \| false) --
	%
	% Updates an entry in the index to include the given word at the given line number.
	% If the entry does not exist then a new entry is created. If an entry exists that
	% already covers this line number, then we do nothing. If an entry exists that
	% extends up to the previous line, then we extend the range to include this line
	% too. Otherwise, we add a new range containing just this new line number.
	%
	update_entry(Word, _, Entry) when length(Word) < 3 ->
	Entry;
	update_entry(Word, LineNum, false) ->
	{Word, [{LineNum, LineNum}]};
	update_entry(Word, LineNum, Entry={Word, [{_,End}\|_]}) when End == LineNum ->
	% Entry already covers this line
	Entry;
	update_entry(Word, LineNum, {Word, [{Start,End}\|Rest]}) when End == LineNum-1 ->
	% There is an existing entry that extends to the previous line, so update it
	{Word, [{Start,LineNum}\|Rest]};
	update_entry(Word, LineNum, {Word, Rest}) ->
	% This is a new occurrence that is discontiguous with any existing entry
	{Word, [{LineNum,LineNum}\|Rest]}.

	% Efficiency considerations:
	%
	% The current approach just uses an unsorted key/value list to store the index,
	% which requires an O(N) linear scan to both find and update entries. Maintaining
	% a the list in sorted order (e.g., via some balanced binary tree structure) would
	% improve this to O(log N). This would also avoid an additional separate O(N log N)
	% sorting stage at the end.
	%
	% Even better would be to use Erlang's new maps (http://erlang.org/eeps/eep-0043.html)
	% that are based on Hash Array Mapped Tries (HAMTs) that provide close to hash-table
	% O(1) lookup and insertion. This would be fairly simple to change, just replacing
	% the find/store procedures to use maps:get/2 and map update syntax and updating the
	% code to pass through an initially empty map. The map key would be the normalized
	% word, while the value would still be a list of occurrences (in most-recent-first order).