Created
March 13, 2017 04:46
-
-
Save mjstrasser/8a1c331c501e370125e7fc6efa08321a to your computer and use it in GitHub Desktop.
Index generator for week 2 assignment in Functional Programming in Erlang MOOC from University of Kent (futurelearn.com)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_file_contents/1,show_file_contents/1, | |
index_file/1, word_index/1]). | |
-include_lib("eunit/include/eunit.hrl"). | |
%% Index the words in the specified file. | |
index_file(Filename) -> | |
word_index(get_file_contents(Filename)). | |
%% Construct an index with the lines that each word appears in the supplied | |
%% text as a list of lines of text. | |
word_index(Lines) -> | |
% RawIndex has entries for every word occurrence with duplicates | |
% in reverse line order. | |
RawIndex = index_lines(Lines, 1, []), | |
% Consolidate the line numbers in the raw index. | |
consolidate_index(RawIndex). | |
% Simple test. | |
word_index_test() -> | |
?assertEqual([ | |
{"this", [{1,2}]}, | |
{"is", [{1,3}]}, | |
{"fun", [{1}]}, | |
{"clever", [{2}]}, | |
{"that", [{3}]}, | |
{"good", [{3}]} | |
], | |
word_index([ | |
"this is fun", | |
"this is clever", | |
"that is good", | |
"" | |
]) | |
). | |
%% Index the words in a list of lines of text. | |
index_lines([], _, Index) -> Index; | |
index_lines([Line|Rem], N, Index) -> | |
Words = split_words(Line), | |
index_lines(Rem, N+1, index_words(Words, N, Index)). | |
%% Index the words for a line of text line with supplied number. | |
%% The generated raw | |
index_words([], _, Index) -> Index; | |
index_words([Word|Rem], N, Index) -> | |
index_words(Rem, N, add_word(Word, N, Index)). | |
index_words_test() -> | |
?assertEqual([], index_words([], 1, [])), | |
Idx1 = [{"one",[{1}]}], | |
?assertEqual(Idx1, index_words(["one"], 1, [])), | |
Idx2 = [{"one",[{2},{1}]},{"two",[{2}]}], | |
?assertEqual(Idx2, index_words(["one","two"], 2, Idx1)), | |
?assertEqual(Idx2, index_words([], 3, Idx2)), | |
Idx4 = [{"one",[{4},{2},{1}]},{"two",[{2}]},{"four",[{4}]}], | |
?assertEqual(Idx4, index_words(["one","four"], 4, Idx2)), | |
Idx5 = [{"one",[{5},{5},{4},{2},{1}]},{"two",[{5},{2}]},{"four",[{4}]}], | |
?assertEqual(Idx5, index_words(["one","two","one"], 5, Idx4)). | |
%% Add a word to the index. This function is not tail-recursive but that will | |
%% be OK unless there are lines containing very many words. | |
%% | |
%% Index is empty: start with this word. | |
add_word(Word, N, []) -> [{Word, [{N}]}]; | |
%% Index contains this word: add line number. | |
add_word(Word, N, [{Word, Entries} | Rem]) -> | |
[{Word, [{N} | Entries]} | Rem]; | |
%% Keep looking for this word in the index. | |
add_word(Word, N, [Idx|Rem]) -> [Idx | add_word(Word, N, Rem)]. | |
add_word_test() -> | |
Idx1 = [{"one",[{1}]}], | |
?assertEqual(Idx1, add_word("one", 1, [])), | |
Idx2 = [{"one",[{2},{1}]}], | |
?assertEqual(Idx2, add_word("one", 2, Idx1)), | |
Idx3 = [{"one",[{2},{1}]},{"two",[{2}]}], | |
?assertEqual(Idx3, add_word("two", 2, Idx2)). | |
%% Is the character a word character: A-Z or a-z. | |
%% (Simple definition for now.) | |
word_char(C) -> C >= $A andalso C =< $Z orelse C >= $a andalso C =< $z. | |
word_char_test() -> | |
?assert(word_char($A)), | |
?assert(word_char($K)), | |
?assert(word_char($z)), | |
?assertNot(word_char($/)), | |
?assertNot(word_char($-)), | |
?assertNot(word_char($,)). | |
%% Split the next word from the line, returning it and the remainder | |
%% as a 2-tuple. | |
next_word(Line) -> next_word(Line, []). | |
next_word([], Word) -> {lists:reverse(Word), ""}; | |
next_word([Char|Chars], Word) -> | |
case word_char(Char) of | |
true -> next_word(Chars, [Char|Word]); | |
false -> {lists:reverse(Word), Chars} | |
end. | |
next_word_test() -> | |
?assertEqual({"", ""}, next_word("")), | |
?assertEqual({"solo", ""}, next_word("solo")), | |
?assertEqual({"Rhubarb", "is tasty"}, next_word("Rhubarb is tasty")). | |
%% Filter a list, returning only non-empty items. | |
non_empty(Xs) -> lists:reverse(non_empty(Xs, [])). | |
non_empty([], NonEmpty) -> NonEmpty; | |
non_empty([[]|Xs], NonEmpty) -> non_empty(Xs, NonEmpty); | |
non_empty([X|Xs], NonEmpty) -> non_empty(Xs, [X|NonEmpty]). | |
non_empty_test() -> | |
?assertEqual([], non_empty([[], []])), | |
?assertEqual(["hello", "there"], non_empty([[], "hello", [], [], "there"])). | |
%% Split a line of characters into a list of words. | |
split_words(Line) -> lists:reverse(non_empty(split_words(Line, []))). | |
split_words([], Words) -> Words; | |
split_words(Line, Words) -> | |
{First, Rem} = next_word(Line), | |
split_words(Rem, [First|Words]). | |
split_words_test() -> | |
?assertEqual([], split_words([])), | |
?assertEqual(["this", "and", "that", "black", "white"], | |
split_words("this and that: black -- white!")). | |
%% Consolidate a raw index containing words with lists of line numbers | |
%% in reverse order. An example is: | |
%% [{"one", [{2},{1}]}, {"two", [{3},{1}]}] | |
consolidate_index(RawIndex) -> | |
lists:reverse(consolidate_index(RawIndex, [])). | |
consolidate_index([], Index) -> Index; | |
consolidate_index([{Word,DescNos}|Rem], Index) -> | |
AscNos = lists:reverse(DescNos), | |
consolidate_index(Rem, [{Word, consolidate_nums(AscNos)} | Index]). | |
consolidate_index_test() -> | |
?assertEqual([], consolidate_index([], [])), | |
?assertEqual([{"one",[{1}]}], consolidate_index([{"one",[{1}]}], [])), | |
?assertEqual([{"one",[{1,2}]},{"two",[{2}]}], | |
consolidate_index([{"one",[{2},{1}]},{"two",[{2}]}])). | |
%% Consolidate a sorted list of 1-tuples of numbers: | |
%% - remove duplicates, thus {1},{1} -> {1} | |
%% - reduce consecutive numbers to 2-tuples with first and last numbers, | |
%% thus {1},{2},{3},{4} -> {1,4} | |
%% - otherwise keep 1-tuples as they are. | |
consolidate_nums(Nums) -> | |
lists:reverse(consolidate_nums(Nums, [])). | |
consolidate_nums([], Accum) -> | |
Accum; | |
consolidate_nums([{N}], Accum) -> | |
[{N} | Accum]; | |
consolidate_nums([{N},{N}|Rem], Accum) -> | |
consolidate_nums([{N}|Rem], Accum); | |
consolidate_nums([{N},{O}|Rem], Accum) when O == N+1 -> | |
consolidate_nums([{N,O}|Rem], Accum); | |
consolidate_nums([{N},{O}|Rem], Accum) -> | |
consolidate_nums([{O}|Rem], [{N} | Accum]); | |
consolidate_nums([{N,O},{O}|Rem], Accum) -> | |
consolidate_nums([{N,O}|Rem], Accum); | |
consolidate_nums([{N,O},{P}|Rem], Accum) when P == O+1 -> | |
consolidate_nums([{N,P}|Rem], Accum); | |
consolidate_nums([{N,O},{P}|Rem], Accum) -> | |
consolidate_nums([{P}|Rem], [{N,O} | Accum]); | |
consolidate_nums([Nums|Rem], Accum) -> | |
consolidate_nums(Rem, [Nums | Accum]). | |
consolidate_nums_test() -> | |
?assertEqual([], consolidate_nums([])), | |
?assertEqual([{1}], consolidate_nums([{1}])), | |
?assertEqual([{1}], consolidate_nums([{1},{1}])), | |
?assertEqual([{1},{3}], consolidate_nums([{1},{3}])), | |
?assertEqual([{1,2}], consolidate_nums([{1},{2}])), | |
?assertEqual([{1,2}], consolidate_nums([{1},{2},{2}])), | |
?assertEqual([{1,4}], consolidate_nums([{1},{2},{2},{3},{4},{4}])), | |
?assertEqual([{1,3},{5}], consolidate_nums([{1},{2},{3},{5}])), | |
?assertEqual([{1,3},{5},{7,9},{12}], | |
consolidate_nums([{1},{2},{3},{5},{7},{8},{9},{12}])), | |
% Numbers must be consecutive! | |
?assertEqual([{1},{3},{2}], consolidate_nums([{1},{3},{2}])). | |
%%%%%%%%%%%%%%%%% Supplied utility functions %%%%%%%%%%%%%%% | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
show_file_contents([L|Ls]) -> | |
io:format("~s~n",[L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment