Skip to content

Instantly share code, notes, and snippets.

@gorkaio
Last active May 19, 2020 17:25
Show Gist options
  • Save gorkaio/bd7fbf814f266dc8efd11e199abcd5c7 to your computer and use it in GitHub Desktop.
Save gorkaio/bd7fbf814f266dc8efd11e199abcd5c7 to your computer and use it in GitHub Desktop.
-module(index).
-export([get_file_contents/1,show_file_contents/1,index/1,index/2]).
-export([split_words_test/0,filter_words_test/0,normalise_words_test/0,words_test/0,index_line_test/0,index_test/0,add_words_test/0,range_test/0]).
-define(MIN_WORD_LENGTH, 2).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Read the text file Name and return its lines as a list of strings,
% in file order (first line first).
% The per-line reading is delegated to get_all_lines/2, which also closes
% the file once end-of-file is reached.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    % get_all_lines/2 accumulates newest-first, so flip the result back.
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line until end-of-file, pushing each line onto Partial,
% so the result is in reverse file order (last line first).
% The file is closed before returning.
%
% Only an actual line terminator ("\n" or "\r\n") is removed from each line.
% A final line that is not newline-terminated is kept intact instead of
% losing its last character.
%
% Raises error({read_error, Reason}) (after closing the file) if the
% underlying read fails.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [string:chomp(Line) | Partial])
    end.
% Print each string of the given list on its own line and return ok.
% Handy for eyeballing the result of get_file_contents/1.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%% Assignment %%%
%% Build a word index from a list of lines.
%%
%% index(L)               - index every word of the lines in L.
%% index(L, ExcludeWords) - same, but words listed in ExcludeWords are left out.
%%
%% The result is a list of {Word, Ranges} tuples sorted by Word, where Ranges
%% is a list of {FirstLine, LastLine} pairs covering the (1-based) line numbers
%% on which Word occurs; consecutive line numbers are merged into one pair.
index(L) -> index(L, []).

index(L, ExcludeWords) -> index(L, ExcludeWords, 1, []).

%% Worker: walks the lines while counting them from FirstLine, merging the
%% words of each line into the index accumulated so far (starting at Index0).
index(Lines, ExcludeWords, FirstLine, Index0) ->
    Step = fun(Line, {LineNo, Index}) ->
        Entries = index_line(Line, ExcludeWords, LineNo),
        {LineNo + 1, add_words(Entries, Index)}
    end,
    {_NextLine, Collected} = lists:foldl(Step, {FirstLine, Index0}, Lines),
    % Collapse each word's line numbers into ranges, then order by word.
    Ranged = lists:map(fun({Word, LineNos}) -> {Word, range(LineNos)} end, Collected),
    lists:keysort(1, Ranged).
%% Self-check for index/1 and index/2.
%% Returns passed; the first mismatch raises a badmatch error.
index_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], index([])},
        {[], index([""])},
        {[], index(["doh!"], ["doh"])},
        {[{"hello", [{1,1}]}], index(["hello world!"], ["world"])},
        {[{"hello",[{1,1}]},{"world",[{1,1}]}], index(["hello world!"])},
        {[{"hello",[{1,2}]},{"universe",[{2,2}]},{"world",[{1,1}]}],
         index(["hello world", "hello universe"])},
        {[{"baby", [{1,1},{3,3}]},{"hello",[{1,3}]},{"universe",[{3,3}]}],
         index(["hello baby", "hello world", "hello baby universe"], ["world"])},
        {[{"universe",[{2,2}]}, {"world",[{1,1}]}],
         index(["hello world", "hello universe"], ["hello"])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
%% Index a single line.
%% Returns one {Word, [LineCount]} entry for every word that words/2 extracts
%% from the string S (ExcludeWords is passed through to words/2).
index_line(S, ExcludeWords, LineCount) ->
    [{Word, [LineCount]} || Word <- words(S, ExcludeWords)].
%% Self-check for index_line/3.
%% Returns passed; the first mismatch raises a badmatch error.
index_line_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], index_line("", [], 2)},
        {[{"foo", [2]}, {"nice", [2]}, {"word", [2]}],
         index_line("foo is a nice word! foo! foo!", [], 2)}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
% Turn a string into its list of index-worthy words.
% The string goes through three stages: split_words/1, normalise_words/1
% and finally filter_words/2, which also receives ExcludeWords.
% words/1 is the same with nothing excluded.
words(S) -> words(S, []).

words(S, ExcludeWords) ->
    filter_words(normalise_words(split_words(S)), ExcludeWords).
% Self-check for words/1 and words/2.
% Returns passed; the first mismatch raises a badmatch error.
words_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], words("")},
        {[], words(" ")},
        {[], words("me and I", ["and"])},
        {["about","begin","dead","doubt","marley","that","there","was","whatever","with"],
         words("Marley was dead: to begin with. There is no doubt whatever about that.")},
        {["about","begin","dead","doubt","marley","there","was","whatever"],
         words("Marley was dead: to begin with. There is no doubt whatever about that.", ["that","with"])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
% Split a string into a list of words.
% Every character that is not a letter or a digit acts as a separator;
% empty fragments (from consecutive separators or leading/trailing ones)
% are dropped. Case is preserved.
%
% The regex runs in Unicode mode (unicode + ucp), so strings containing
% code points above 255 no longer raise badarg, and letters/digits are
% recognised using Unicode character properties.
split_words(S) ->
    Parts = re:split(S, "[[:^alnum:]]", [unicode, ucp, {return, list}]),
    [Part || Part <- Parts, Part /= []].
% Self-check for split_words/1.
% Returns passed; the first mismatch raises a badmatch error.
split_words_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], split_words("")},
        {[], split_words(" ")},
        {["hello", "world"], split_words("hello world")},
        {["hello", "world"], split_words("hello 'world'")},
        {["December", "1843", "Stave", "1", "Marley", "s", "Ghost"],
         split_words("December, 1843. Stave 1: Marley's Ghost")}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
% Normalise words: return the list with every word converted to lower case
% (via string:lowercase/1). Order and length of the list are unchanged.
normalise_words(L) ->
    [string:lowercase(Word) || Word <- L].
% Self-check for normalise_words/1.
% Returns passed; the first mismatch raises a badmatch error.
normalise_words_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], normalise_words([])},
        {["december", "1843", "stave", "1", "marley", "s", "ghost"],
         normalise_words(["DeCeMber", "1843", "STAVE", "1", "MarleY", "s", "Ghost"])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
% Filter the words in a list:
%  - keep only words strictly longer than MIN_WORD_LENGTH characters
%    (words of exactly MIN_WORD_LENGTH characters are dropped too);
%  - drop every occurrence of any word listed in Exclude;
%  - return the survivors sorted and without duplicates.
% filter_words/1 is the same with nothing excluded.
filter_words(L) -> filter_words(L, []).

filter_words(L, Exclude) ->
    % A membership test is used rather than `--`, because `--` removes only
    % the first occurrence of each excluded word and would let repeats through.
    Keep = fun(Word) ->
        length(Word) > ?MIN_WORD_LENGTH andalso not lists:member(Word, Exclude)
    end,
    lists:usort(lists:filter(Keep, L)). % Sorts words and removes duplicates
% Self-check for filter_words/1 and filter_words/2.
% Returns passed; the first mismatch raises a badmatch error.
filter_words_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], filter_words([])},
        {["love", "you"], filter_words(["it", "is", "you", "my", "love"])},
        {["1843","December","Ghost","Marley","Stave"],
         filter_words(["December", "1843", "Stave", "1", "Marley", "s", "Ghost"])},
        {["love", "you"], filter_words(["it", "is", "you", "my", "love"], [])},
        {["you"], filter_words(["it", "is", "you", "my", "love"], ["love"])},
        {["1843","December","Stave"],
         filter_words(["December", "1843", "Stave", "1", "Marley", "s", "Ghost"], ["Marley", "Ghost"])},
        {["1843","December","Stave"],
         filter_words(["December", "1843", "December", "Stave", "1", "Marley", "s", "Ghost"], ["Marley", "Ghost"])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
%% Add words to index.
%% Takes a list of {Word, Lines} entries and merges them, one after another,
%% into Index (itself a list of {Word, Lines} tuples).
%%  - If Word is already indexed, its old entry is removed and a merged entry
%%    (sorted, duplicate-free union of both line lists) is put at the front.
%%  - Otherwise the new entry is put at the front unchanged.
add_words(Entries, Index) ->
    Merge = fun({Word, WordLines} = Entry, Current) ->
        case lists:keytake(Word, 1, Current) of
            {value, {Word, Known}, Others} ->
                [{Word, lists:usort(WordLines ++ Known)} | Others];
            _ ->
                [Entry | Current]
        end
    end,
    lists:foldl(Merge, Index, Entries).
%% Self-check for add_words/2.
%% Returns passed; the first mismatch raises a badmatch error.
add_words_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], add_words([], [])},
        {[{"hello", [1]}], add_words([],[{"hello", [1]}])},
        {[{"hello",[1]},{"bye",[2]}], add_words([{"hello", [1]}], [{"bye", [2]}])},
        {[{"hello",[1,2]}], add_words([{"hello", [1]}], [{"hello", [2]}])},
        {[{"hello",[1,2]},{"bye", [3]}],
         add_words([{"hello", [1]}], [{"bye", [3]}, {"hello", [2]}])},
        {[{"bye", [1,3]},{"hello",[1,2]}],
         add_words([{"hello", [1]}, {"bye", [1]}], [{"bye", [3]}, {"hello", [2]}])},
        {[{"bye", [1]},{"hello",[1,2]}],
         add_words([{"hello", [1]}, {"bye", [1]}], [{"bye", [1]}, {"hello", [2]}])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
%% Generate ranges.
%% Collapses a list of numbers into a list of {First, Last} pairs: each run
%% in which every element is exactly one more than its predecessor becomes a
%% single pair, and any other step starts a new pair.
%% An empty list yields an empty list.
range([]) -> [];
range([First | Rest]) ->
    % The accumulator is {RunStart, RunEnd, ClosedRuns}, with ClosedRuns
    % held newest-first.
    Extend = fun
        (N, {From, To, Closed}) when To + 1 == N -> {From, N, Closed};
        (N, {From, To, Closed}) -> {N, N, [{From, To} | Closed]}
    end,
    {RunStart, RunEnd, ClosedRuns} = lists:foldl(Extend, {First, First, []}, Rest),
    lists:reverse([{RunStart, RunEnd} | ClosedRuns]).
%% Self-check for range/1.
%% Returns passed; the first mismatch raises a badmatch error.
range_test() ->
    % Each entry is {Expected, Actual}.
    Cases = [
        {[], range([])},
        {[{1,1}], range([1])},
        {[{1,2}], range([1,2])},
        {[{1,1}, {4,4}], range([1,4])},
        {[{1,4}], range([1,2,3,4])},
        {[{1,1}, {3,5}, {7,7}, {9,10}], range([1,3,4,5,7,9,10])}
    ],
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Cases),
    passed.
@pppillai
Copy link

nice

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment