Skip to content

Instantly share code, notes, and snippets.

@NeilMadden
Created February 28, 2017 17:23
Show Gist options
  • Save NeilMadden/67ba9997a054943b99d1b765edd92158 to your computer and use it in GitHub Desktop.
Save NeilMadden/67ba9997a054943b99d1b765edd92158 to your computer and use it in GitHub Desktop.
Neil Madden "Functional Programming in Erlang MOOC" Submission for Week 2
-module(index).
-export([index_file/1,get_file_contents/1,show_file_contents/1]).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls);
show_file_contents([]) ->
ok.
%% **** BEGIN SOLUTION ****
% We maintain our index using the form as described in the exercise, i.e.,
% {Word, [{Start, End}, {Start, End}, ...]}
% Where each {Start, End} pair is a contiguous range of line numbers on which the
% given word occurs. Unlike the exercise, we reverse the order of the list so that
% the head element is always the most recently found line. This makes checking and
% updating the index cheap as we only have to check the head of the list at any
% point. We then
% index_file(Name) --
%
% Indexes the given file and returns the index as a list of entries of the form
% {Word, [{Start,End}, {Start,End}, ...]}
% where each {Start, End} pair is a contiguous range of lines on which the given
% word occurs.
%
index_file(Name) ->
lists:keysort(1, reverse_indices(index_lines(get_file_contents(Name), 1, []))).
% reverse_indices(Index) --
%
% Reverses the order of the occurrences for each word in the index, so that they
% are in the order in which they occur in the file.
%
reverse_indices([]) ->
[];
reverse_indices([{Word, Occurrences}|Rest]) ->
[{Word, lists:reverse(Occurrences)} | reverse_indices(Rest)].
% index_lines(Lines, LineNumber, Index) --
%
% Indexes the list of Lines given as input, updating the Index with the LineNumbers
% on which each word occurs. Returns the updated index.
%
index_lines([], _, Index) ->
Index;
index_lines([Line|Lines], Num, Index) ->
% Split the line into words by whitespace and punctuation
Words = string:tokens(Line, " \t,.!;:\"\'[](){}&-<`\\"),
% Index all words within this line
NewIndex = index_words(Words, Num, Index),
% Index the remaining lines
index_lines(Lines, Num+1, NewIndex).
% index_words(Words, LineNumber, Index) --
%
% Index all the words in a line.
%
index_words([], _, Index) ->
Index;
index_words([Word|Words], LineNum, Index) ->
NormalizedWord = normalize(Word),
% Find the word in the key list and update it with the new index entry
NewIndex = case update_entry(NormalizedWord, LineNum,
find(NormalizedWord, Index)) of
% false if the word was too boring to index
false -> Index;
NewEntry -> store(NormalizedWord, Index, NewEntry)
end,
% Now update the index with the new entry and index the remaining words
index_words(Words, LineNum, NewIndex).
find(Word, Index) ->
lists:keyfind(Word, 1, Index).
store(Word, Index, Entry) ->
lists:keystore(Word, 1, Index, Entry).
% normalize(Word) --
%
% Normalizes the given word. Currently just converts to lowercase rather than
% performing stemming or other transformations.
%
normalize(Word) ->
string:to_lower(Word).
% update_entry(Word, LineNumber, ExistingEntry | false) --
%
% Updates an entry in the index to include the given word at the given line number.
% If the entry does not exist then a new entry is created. If an entry exists that
% already covers this line number, then we do nothing. If an entry exists that
% extends up to the previous line, then we extend the range to include this line
% too. Otherwise, we add a new range containing just this new line number.
%
update_entry(Word, _, Entry) when length(Word) < 3 ->
Entry;
update_entry(Word, LineNum, false) ->
{Word, [{LineNum, LineNum}]};
update_entry(Word, LineNum, Entry={Word, [{_,End}|_]}) when End == LineNum ->
% Entry already covers this line
Entry;
update_entry(Word, LineNum, {Word, [{Start,End}|Rest]}) when End == LineNum-1 ->
% There is an existing entry that extends to the previous line, so update it
{Word, [{Start,LineNum}|Rest]};
update_entry(Word, LineNum, {Word, Rest}) ->
% This is a new occurrence that is discontiguous with any existing entry
{Word, [{LineNum,LineNum}|Rest]}.
% Efficiency considerations:
%
% The current approach just uses an unsorted key/value list to store the index,
% which requires an O(N) linear scan to both find and update entries. Maintaining
% a the list in sorted order (e.g., via some balanced binary tree structure) would
% improve this to O(log N). This would also avoid an additional separate O(N log N)
% sorting stage at the end.
%
% Even better would be to use Erlang's new maps (http://erlang.org/eeps/eep-0043.html)
% that are based on Hash Array Mapped Tries (HAMTs) that provide close to hash-table
% O(1) lookup and insertion. This would be fairly simple to change, just replacing
% the find/store procedures to use maps:get/2 and map update syntax and updating the
% code to pass through an initially empty map. The map key would be the normalized
% word, while the value would still be a list of occurrences (in most-recent-first order).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment