Skip to content

Instantly share code, notes, and snippets.

@mjstrasser
Created March 13, 2017 04:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjstrasser/8a1c331c501e370125e7fc6efa08321a to your computer and use it in GitHub Desktop.
Save mjstrasser/8a1c331c501e370125e7fc6efa08321a to your computer and use it in GitHub Desktop.
Index generator for week 2 assignment in Functional Programming in Erlang MOOC from University of Kent (futurelearn.com)
-module(index).
-export([get_file_contents/1,show_file_contents/1,
index_file/1, word_index/1]).
-include_lib("eunit/include/eunit.hrl").
%% Index the words in the specified file.
index_file(Filename) ->
word_index(get_file_contents(Filename)).
%% Construct an index with the lines that each word appears in the supplied
%% text as a list of lines of text.
word_index(Lines) ->
% RawIndex has entries for every word occurrence with duplicates
% in reverse line order.
RawIndex = index_lines(Lines, 1, []),
% Consolidate the line numbers in the raw index.
consolidate_index(RawIndex).
% Simple test.
word_index_test() ->
?assertEqual([
{"this", [{1,2}]},
{"is", [{1,3}]},
{"fun", [{1}]},
{"clever", [{2}]},
{"that", [{3}]},
{"good", [{3}]}
],
word_index([
"this is fun",
"this is clever",
"that is good",
""
])
).
%% Index the words in a list of lines of text.
index_lines([], _, Index) -> Index;
index_lines([Line|Rem], N, Index) ->
Words = split_words(Line),
index_lines(Rem, N+1, index_words(Words, N, Index)).
%% Index the words for a line of text line with supplied number.
%% The generated raw
index_words([], _, Index) -> Index;
index_words([Word|Rem], N, Index) ->
index_words(Rem, N, add_word(Word, N, Index)).
index_words_test() ->
?assertEqual([], index_words([], 1, [])),
Idx1 = [{"one",[{1}]}],
?assertEqual(Idx1, index_words(["one"], 1, [])),
Idx2 = [{"one",[{2},{1}]},{"two",[{2}]}],
?assertEqual(Idx2, index_words(["one","two"], 2, Idx1)),
?assertEqual(Idx2, index_words([], 3, Idx2)),
Idx4 = [{"one",[{4},{2},{1}]},{"two",[{2}]},{"four",[{4}]}],
?assertEqual(Idx4, index_words(["one","four"], 4, Idx2)),
Idx5 = [{"one",[{5},{5},{4},{2},{1}]},{"two",[{5},{2}]},{"four",[{4}]}],
?assertEqual(Idx5, index_words(["one","two","one"], 5, Idx4)).
%% Add a word to the index. This function is not tail-recursive but that will
%% be OK unless there are lines containing very many words.
%%
%% Index is empty: start with this word.
add_word(Word, N, []) -> [{Word, [{N}]}];
%% Index contains this word: add line number.
add_word(Word, N, [{Word, Entries} | Rem]) ->
[{Word, [{N} | Entries]} | Rem];
%% Keep looking for this word in the index.
add_word(Word, N, [Idx|Rem]) -> [Idx | add_word(Word, N, Rem)].
add_word_test() ->
Idx1 = [{"one",[{1}]}],
?assertEqual(Idx1, add_word("one", 1, [])),
Idx2 = [{"one",[{2},{1}]}],
?assertEqual(Idx2, add_word("one", 2, Idx1)),
Idx3 = [{"one",[{2},{1}]},{"two",[{2}]}],
?assertEqual(Idx3, add_word("two", 2, Idx2)).
%% Is the character a word character: A-Z or a-z.
%% (Simple definition for now.)
word_char(C) -> C >= $A andalso C =< $Z orelse C >= $a andalso C =< $z.
word_char_test() ->
?assert(word_char($A)),
?assert(word_char($K)),
?assert(word_char($z)),
?assertNot(word_char($/)),
?assertNot(word_char($-)),
?assertNot(word_char($,)).
%% Split the next word from the line, returning it and the remainder
%% as a 2-tuple.
next_word(Line) -> next_word(Line, []).
next_word([], Word) -> {lists:reverse(Word), ""};
next_word([Char|Chars], Word) ->
case word_char(Char) of
true -> next_word(Chars, [Char|Word]);
false -> {lists:reverse(Word), Chars}
end.
next_word_test() ->
?assertEqual({"", ""}, next_word("")),
?assertEqual({"solo", ""}, next_word("solo")),
?assertEqual({"Rhubarb", "is tasty"}, next_word("Rhubarb is tasty")).
%% Filter a list, returning only non-empty items.
non_empty(Xs) -> lists:reverse(non_empty(Xs, [])).
non_empty([], NonEmpty) -> NonEmpty;
non_empty([[]|Xs], NonEmpty) -> non_empty(Xs, NonEmpty);
non_empty([X|Xs], NonEmpty) -> non_empty(Xs, [X|NonEmpty]).
non_empty_test() ->
?assertEqual([], non_empty([[], []])),
?assertEqual(["hello", "there"], non_empty([[], "hello", [], [], "there"])).
%% Split a line of characters into a list of words.
split_words(Line) -> lists:reverse(non_empty(split_words(Line, []))).
split_words([], Words) -> Words;
split_words(Line, Words) ->
{First, Rem} = next_word(Line),
split_words(Rem, [First|Words]).
split_words_test() ->
?assertEqual([], split_words([])),
?assertEqual(["this", "and", "that", "black", "white"],
split_words("this and that: black -- white!")).
%% Consolidate a raw index containing words with lists of line numbers
%% in reverse order. An example is:
%% [{"one", [{2},{1}]}, {"two", [{3},{1}]}]
consolidate_index(RawIndex) ->
lists:reverse(consolidate_index(RawIndex, [])).
consolidate_index([], Index) -> Index;
consolidate_index([{Word,DescNos}|Rem], Index) ->
AscNos = lists:reverse(DescNos),
consolidate_index(Rem, [{Word, consolidate_nums(AscNos)} | Index]).
consolidate_index_test() ->
?assertEqual([], consolidate_index([], [])),
?assertEqual([{"one",[{1}]}], consolidate_index([{"one",[{1}]}], [])),
?assertEqual([{"one",[{1,2}]},{"two",[{2}]}],
consolidate_index([{"one",[{2},{1}]},{"two",[{2}]}])).
%% Consolidate a sorted list of 1-tuples of numbers:
%% - remove duplicates, thus {1},{1} -> {1}
%% - reduce consecutive numbers to 2-tuples with first and last numbers,
%% thus {1},{2},{3},{4} -> {1,4}
%% - otherwise keep 1-tuples as they are.
consolidate_nums(Nums) ->
lists:reverse(consolidate_nums(Nums, [])).
consolidate_nums([], Accum) ->
Accum;
consolidate_nums([{N}], Accum) ->
[{N} | Accum];
consolidate_nums([{N},{N}|Rem], Accum) ->
consolidate_nums([{N}|Rem], Accum);
consolidate_nums([{N},{O}|Rem], Accum) when O == N+1 ->
consolidate_nums([{N,O}|Rem], Accum);
consolidate_nums([{N},{O}|Rem], Accum) ->
consolidate_nums([{O}|Rem], [{N} | Accum]);
consolidate_nums([{N,O},{O}|Rem], Accum) ->
consolidate_nums([{N,O}|Rem], Accum);
consolidate_nums([{N,O},{P}|Rem], Accum) when P == O+1 ->
consolidate_nums([{N,P}|Rem], Accum);
consolidate_nums([{N,O},{P}|Rem], Accum) ->
consolidate_nums([{P}|Rem], [{N,O} | Accum]);
consolidate_nums([Nums|Rem], Accum) ->
consolidate_nums(Rem, [Nums | Accum]).
consolidate_nums_test() ->
?assertEqual([], consolidate_nums([])),
?assertEqual([{1}], consolidate_nums([{1}])),
?assertEqual([{1}], consolidate_nums([{1},{1}])),
?assertEqual([{1},{3}], consolidate_nums([{1},{3}])),
?assertEqual([{1,2}], consolidate_nums([{1},{2}])),
?assertEqual([{1,2}], consolidate_nums([{1},{2},{2}])),
?assertEqual([{1,4}], consolidate_nums([{1},{2},{2},{3},{4},{4}])),
?assertEqual([{1,3},{5}], consolidate_nums([{1},{2},{3},{5}])),
?assertEqual([{1,3},{5},{7,9},{12}],
consolidate_nums([{1},{2},{3},{5},{7},{8},{9},{12}])),
% Numbers must be consecutive!
?assertEqual([{1},{3},{2}], consolidate_nums([{1},{3},{2}])).
%%%%%%%%%%%%%%%%% Supplied utility functions %%%%%%%%%%%%%%%
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls);
show_file_contents([]) ->
ok.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment