Skip to content

Instantly share code, notes, and snippets.

@colinbankier
Created March 8, 2017 12:45
Show Gist options
  • Save colinbankier/3273bdca9fda41662ef9189e07142b2b to your computer and use it in GitHub Desktop.
Save colinbankier/3273bdca9fda41662ef9189e07142b2b to your computer and use it in GitHub Desktop.
Indexing a file - Erlang
-module(index).
-export([get_index/1]).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% The main entry to the module, returns the index for the given file name.
get_index(FileName) ->
Contents = get_file_contents(FileName),
build_index(Contents, 1, #{}).
% Builds the index recursively, accumulating entries and associated line
% numbers in the Index parameter.
% I used a Map data structure here for easier lookup and adding to as lines are
% processed.
% In the base case, when in the index is complete, the list of line numbers is
% grouped into contiguous ranges.
% The final index is then sorted by word.
% I decided to utilise a few functions from the Erlang std lib,
% e.g. lists:sort, string:to_lower, string:tokens
% given that we have implemented our own sort and similar functions in previous exercises,
% and using these % allowed more time to focus on other aspects of the challenge.
build_index([], _LineNum, Index) ->
GroupedIndex = group_values_by_range(maps:to_list(Index)),
lists:sort(fun(A, B) -> A =< B end, GroupedIndex);
build_index([H|T], LineNum, Index) ->
LineWords = nub(tokenize(string:to_lower(H))),
FilteredWords = filter_short_words(LineWords),
NewIndex = index_words(FilteredWords, LineNum, Index),
build_index(T, LineNum + 1, NewIndex).
% Indexes words from a single line, adding an entry to Index for LineNum.
index_words([], _LineNum, Index) ->
Index;
index_words([H|T], LineNum, Index) ->
Lines = maps:get(H, Index, []),
NewIndex = maps:put(H, [LineNum|Lines], Index),
index_words(T, LineNum, NewIndex).
% Splits a line into individual words.
% Std lib function used.
tokenize(Line) ->
string:tokens(Line, " ,.;?!`[]()\\\"").
% Removes short words from being indexed.
% Could be extended to filter common words such as "and".
filter_short_words(Words) when is_list(Words) ->
filter_short_words(Words, []).
filter_short_words([], Acc) ->
Acc;
filter_short_words([H|T], Acc) ->
case length(H) < 3 of
true ->
filter_short_words(T, Acc);
false ->
filter_short_words(T, [H|Acc])
end.
% Groups the values by range for an index of Key,Value pairs.
% The values are assumed to be a list of numbers which are
% group into contiguous ranges.
group_values_by_range(Pairs) when is_list(Pairs) ->
group_values_by_range(Pairs, []).
group_values_by_range([{Key, Value}|T], Acc) ->
group_values_by_range(T, [{Key, group_by_range(lists:sort(Value))}|Acc]);
group_values_by_range([], Acc) ->
Acc.
% Groups a list of numbers into contiguous ranges.
group_by_range([]) ->
[];
group_by_range([H|T]) ->
group_by_range(T, {H,H}, []).
group_by_range([], Range, Acc) ->
lists:reverse([Range|Acc]);
group_by_range([H|T], {S,E}, Acc) when E+1 == H ->
group_by_range(T, {S,H}, Acc);
group_by_range([H|T], Range={_S,_E}, Acc) ->
group_by_range(T, {H,H},[Range|Acc]).
% Returns a list with duplicate elements removed.
nub(List) ->
lists:reverse(nub(List, [])).
nub([], Acc) ->
Acc;
nub([H|T], Acc) ->
NextAcc = case lists:member(H, Acc) of
true -> Acc;
false -> [H | Acc]
end,
nub(T, NextAcc).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment