colinbankier/index.erl

## index.erl
-module(index).
-export([get_index/1]).

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Reads the text file called Name and returns its lines in file order.
% get_all_lines/2 hands the lines back last-first (with the line ending
% stripped), so its result is reversed before being returned.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).

% Auxiliary function for get_file_contents. Not exported.
% Reads lines from the open device File until eof, pushing each one onto
% Partial, so the returned list holds the lines in reverse order.
% The device is closed once eof is reached.
% Only a trailing newline is removed from each line, so the last line of
% a file that does not end in a newline keeps its final character.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Drops a single trailing newline from Line, if there is one.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | RevRest] -> lists:reverse(RevRest);
        _NoNewline -> Line
    end.

% Public entry point: returns the word index for the file FileName.
% The file is read into a list of lines, which are then indexed starting
% at line number 1 with an empty map as the initial index.
get_index(FileName) ->
    Lines = get_file_contents(FileName),
    EmptyIndex = maps:new(),
    build_index(Lines, 1, EmptyIndex).

% Builds the index one line at a time, accumulating each word and the
% numbers of the lines it appears on in the Index map.
% A map is used because it is convenient to look up and extend while the
% lines are being processed.
% Each line is lower-cased, tokenized, de-duplicated and stripped of short
% words before its words are recorded against LineNum.
% Once every line has been consumed, the line numbers of each word are
% grouped into contiguous ranges and the entries are sorted by word.
% Standard library functions (lists:sort, string:to_lower, string:tokens)
% are used deliberately: sorting and similar functions were already
% hand-written in earlier exercises, which left more time for the other
% aspects of the challenge.
build_index([], _LineNum, Index) ->
    Entries = maps:to_list(Index),
    lists:sort(group_values_by_range(Entries));
build_index([Line | Rest], LineNum, Index) ->
    Lowered = string:to_lower(Line),
    Words = filter_short_words(nub(tokenize(Lowered))),
    build_index(Rest, LineNum + 1, index_words(Words, LineNum, Index)).

% Records LineNum against every word in Words and returns the updated map.
% A word that is not yet a key of Index starts from an empty list; LineNum
% is always pushed onto the front of the word's existing line numbers.
index_words(Words, LineNum, Index) ->
    lists:foldl(
        fun(Word, Acc) ->
            Seen = maps:get(Word, Acc, []),
            Acc#{Word => [LineNum | Seen]}
        end,
        Index,
        Words
    ).

% Breaks Line into words using the standard library's string:tokens/2.
% Every character of Separators (space, common punctuation, brackets,
% backslash and double quote) ends a word; empty tokens are not returned.
tokenize(Line) ->
    Separators = " ,.;?!`[]()\\\"",
    string:tokens(Line, Separators).

% Keeps only the words that are at least three characters long.
% Could be extended to filter common words such as "and".
% The surviving words are pushed onto an accumulator, so they come back
% in the reverse of their input order.
filter_short_words(Words) when is_list(Words) ->
    filter_short_words(Words, []).

% Worker: folds over Words, prepending every long-enough word to Acc.
filter_short_words(Words, Acc) ->
    lists:foldl(
        fun(Word, Kept) ->
            case length(Word) of
                Len when Len >= 3 -> [Word | Kept];
                _TooShort -> Kept
            end
        end,
        Acc,
        Words
    ).

% Takes a list of {Key, Value} pairs whose values are lists of numbers and
% replaces each value with its contiguous ranges: the numbers are sorted
% and then handed to group_by_range/1.
% Pairs are pushed onto an accumulator, so the result is in the reverse of
% the input order.
group_values_by_range(Pairs) when is_list(Pairs) ->
    group_values_by_range(Pairs, []).

% Worker: folds over Pairs, prepending each converted pair to Acc.
group_values_by_range(Pairs, Acc) ->
    lists:foldl(
        fun({Word, LineNums}, Done) ->
            [{Word, group_by_range(lists:sort(LineNums))} | Done]
        end,
        Acc,
        Pairs
    ).

% Groups an ascending list of numbers into contiguous ranges.
% Returns a list of {Start, End} tuples in ascending order; a number with
% no neighbours becomes {N, N}, and the empty list gives [].
% A number equal to the end of the current range (a duplicate in the
% input) is absorbed into that range instead of opening a new one.
group_by_range([]) ->
    [];
group_by_range([First | Rest]) ->
    group_by_range(Rest, {First, First}, []).

% Worker: Range is the range being extended, Acc holds the finished
% ranges in reverse order.
group_by_range([], Range, Acc) ->
    lists:reverse([Range | Acc]);
group_by_range([H | T], {_Start, End} = Range, Acc) when H == End ->
    % Duplicate of the current end: nothing to extend.
    group_by_range(T, Range, Acc);
group_by_range([H | T], {Start, End}, Acc) when End + 1 == H ->
    group_by_range(T, {Start, H}, Acc);
group_by_range([H | T], {_, _} = Range, Acc) ->
    % Gap found: close the current range and start a new one at H.
    group_by_range(T, {H, H}, [Range | Acc]).

% Returns List with duplicate elements removed.
% The first occurrence of each element is kept and the original order of
% those first occurrences is preserved.
nub(List) ->
    lists:reverse(nub(List, [])).

% Worker: Seen holds the distinct elements found so far, newest first.
nub([], Seen) ->
    Seen;
nub([Item | Rest], Seen) ->
    case lists:member(Item, Seen) of
        true -> nub(Rest, Seen);
        false -> nub(Rest, [Item | Seen])
    end.
	-module(index).
	-export([get_index/1]).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Reads the text file called Name and returns its lines in file order.
	% get_all_lines/2 hands the lines back last-first (with the line ending
	% stripped), so its result is reversed before being returned.
	% Crashes with a badmatch if the file cannot be opened.
	get_file_contents(Name) ->
	    {ok, Device} = file:open(Name, [read]),
	    lists:reverse(get_all_lines(Device, [])).

	% Auxiliary function for get_file_contents. Not exported.
	% Reads lines from the open device File until eof, pushing each one onto
	% Partial, so the returned list holds the lines in reverse order.
	% The device is closed once eof is reached.
	% Only a trailing newline is removed from each line, so the last line of
	% a file that does not end in a newline keeps its final character.
	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            get_all_lines(File, [strip_newline(Line) | Partial])
	    end.

	% Drops a single trailing newline from Line, if there is one.
	strip_newline(Line) ->
	    case lists:reverse(Line) of
	        [$\n | RevRest] -> lists:reverse(RevRest);
	        _NoNewline -> Line
	    end.

	% Public entry point: returns the word index for the file FileName.
	% The file is read into a list of lines, which are then indexed starting
	% at line number 1 with an empty map as the initial index.
	get_index(FileName) ->
	    Lines = get_file_contents(FileName),
	    EmptyIndex = maps:new(),
	    build_index(Lines, 1, EmptyIndex).

	% Builds the index recursively, accumulating words and their associated
	% line numbers in the Index map.
	% A map is used because it is convenient to look up and extend while the
	% lines are being processed.
	% In the base case, when the index is complete, each word's list of line
	% numbers is grouped into contiguous ranges and the final index is
	% sorted by word.
	% Standard library functions (lists:sort, string:to_lower, string:tokens)
	% are used deliberately: sorting and similar functions were already
	% hand-written in earlier exercises, which left more time for the other
	% aspects of the challenge.
	build_index([], _LineNum, Index) ->
	    GroupedIndex = group_values_by_range(maps:to_list(Index)),
	    lists:sort(fun(A, B) -> A =< B end, GroupedIndex);
	build_index([H | T], LineNum, Index) ->
	    % Each distinct word is recorded at most once per line.
	    LineWords = nub(tokenize(string:to_lower(H))),
	    FilteredWords = filter_short_words(LineWords),
	    NewIndex = index_words(FilteredWords, LineNum, Index),
	    build_index(T, LineNum + 1, NewIndex).

	% Indexes the words of a single line: LineNum is pushed onto the front of
	% the line-number list stored in Index for every word in the list.
	% A word that is not yet a key of Index starts from an empty list.
	% Returns the updated map.
	index_words([], _LineNum, Index) ->
	    Index;
	index_words([H | T], LineNum, Index) ->
	    Lines = maps:get(H, Index, []),
	    NewIndex = maps:put(H, [LineNum | Lines], Index),
	    index_words(T, LineNum, NewIndex).

	% Breaks Line into words using the standard library's string:tokens/2.
	% Every character of Separators (space, common punctuation, brackets,
	% backslash and double quote) ends a word; empty tokens are not returned.
	tokenize(Line) ->
	    Separators = " ,.;?!`[]()\\\"",
	    string:tokens(Line, Separators).

	% Removes words shorter than three characters so they are not indexed.
	% Could be extended to filter common words such as "and".
	% Kept words are pushed onto an accumulator, so they are returned in the
	% reverse of their input order.
	filter_short_words(Words) when is_list(Words) ->
	    filter_short_words(Words, []).

	% Worker: Acc holds the words kept so far, newest first.
	filter_short_words([], Acc) ->
	    Acc;
	filter_short_words([H | T], Acc) ->
	    case length(H) < 3 of
	        true ->
	            filter_short_words(T, Acc);
	        false ->
	            filter_short_words(T, [H | Acc])
	    end.

	% Groups the values by range for a list of {Key, Value} pairs.
	% Each value is expected to be a list of numbers; it is sorted and then
	% grouped into contiguous ranges by group_by_range/1.
	% Pairs are pushed onto an accumulator, so the result is in the reverse
	% of the input order.
	group_values_by_range(Pairs) when is_list(Pairs) ->
	    group_values_by_range(Pairs, []).

	% Worker: Acc holds the converted pairs, newest first.
	group_values_by_range([{Key, Value} | T], Acc) ->
	    group_values_by_range(T, [{Key, group_by_range(lists:sort(Value))} | Acc]);
	group_values_by_range([], Acc) ->
	    Acc.

	% Groups an ascending list of numbers into contiguous ranges.
	% Returns a list of {Start, End} tuples in ascending order; a number with
	% no neighbours becomes {N, N}, and the empty list gives [].
	% A number equal to the end of the current range (a duplicate in the
	% input) is absorbed into that range instead of opening a new one.
	group_by_range([]) ->
	    [];
	group_by_range([H | T]) ->
	    group_by_range(T, {H, H}, []).

	% Worker: Range is the range being extended, Acc holds the finished
	% ranges in reverse order.
	group_by_range([], Range, Acc) ->
	    lists:reverse([Range | Acc]);
	group_by_range([H | T], {_S, E} = Range, Acc) when H == E ->
	    % Duplicate of the current end: nothing to extend.
	    group_by_range(T, Range, Acc);
	group_by_range([H | T], {S, E}, Acc) when E + 1 == H ->
	    group_by_range(T, {S, H}, Acc);
	group_by_range([H | T], Range = {_S, _E}, Acc) ->
	    % Gap found: close the current range and start a new one at H.
	    group_by_range(T, {H, H}, [Range | Acc]).

	% Returns a list with duplicate elements removed.
	% The first occurrence of each element is kept and the original order of
	% those first occurrences is preserved.
	nub(List) ->
	    lists:reverse(nub(List, [])).

	% Worker: Acc holds the distinct elements found so far, newest first.
	nub([], Acc) ->
	    Acc;
	nub([H | T], Acc) ->
	    NextAcc =
	        case lists:member(H, Acc) of
	            true -> Acc;
	            false -> [H | Acc]
	        end,
	    nub(T, NextAcc).