%% gorkaio/index.erl

%% index.erl
-module(index).
-export([get_file_contents/1,show_file_contents/1,index/1,index/2]).
-export([split_words_test/0,filter_words_test/0,normalise_words_test/0,words_test/0,index_line_test/0,index_test/0,add_words_test/0,range_test/0]).

-define(MIN_WORD_LENGTH, 2).

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.

%% Read the text file Name and return its lines in file order.
%% Crashes with a badmatch when the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    %% get_all_lines/2 accumulates newest-first, so flip the result.
    lists:reverse(get_all_lines(Device, [])).

% Auxiliary function for get_file_contents.
% Not exported.

%% Push every remaining line of the open io device File onto Partial
%% (newest line first) and close the device once eof is reached.
%% Only a trailing "\n" is stripped: a final line that is not terminated
%% by a newline keeps its last character.
get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            Strip = case lists:suffix("\n", Line) of
                true -> lists:droplast(Line);
                false -> Line
            end,
            get_all_lines(File,[Strip|Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.

%% Print each string of the list on its own line; returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

%% Assignment %%%

%% Generate word index from lines

%% Build a word index from a list of lines.
%% Lines are numbered from 1. The result is a list of {Word, Ranges}
%% tuples sorted by word, where Ranges is the list of {First, Last}
%% line ranges produced by range/1. Words listed in ExcludeWords are
%% left out.
index(L) -> index(L, []).

index(L, ExcludeWords) -> index(L, ExcludeWords, 1, []).

index(Lines, ExcludeWords, FirstLineNo, Seed) ->
    %% Thread the current line number through the fold together with
    %% the index accumulated so far.
    {_NextLineNo, Entries} = lists:foldl(
        fun(Line, {LineNo, Acc}) ->
            LineWords = index_line(Line, ExcludeWords, LineNo),
            {LineNo + 1, add_words(LineWords, Acc)}
        end,
        {FirstLineNo, Seed},
        Lines),
    Ranged = [{Word, range(LineNos)} || {Word, LineNos} <- Entries],
    lists:keysort(1, Ranged).

%% Table-driven checks for index/1 and index/2; returns passed, or
%% crashes with a badmatch on the first failing case.
index_test() ->
    DefaultCases = [
        {[], []},
        {[""], []},
        {["hello world!"],
         [{"hello",[{1,1}]},{"world",[{1,1}]}]},
        {["hello world", "hello universe"],
         [{"hello",[{1,2}]},{"universe",[{2,2}]},{"world",[{1,1}]}]}
    ],
    ExcludeCases = [
        {["doh!"], ["doh"], []},
        {["hello world!"], ["world"],
         [{"hello", [{1,1}]}]},
        {["hello baby", "hello world", "hello baby universe"], ["world"],
         [{"baby", [{1,1},{3,3}]},{"hello",[{1,3}]},{"universe",[{3,3}]}]},
        {["hello world", "hello universe"], ["hello"],
         [{"universe",[{2,2}]}, {"world",[{1,1}]}]}
    ],
    lists:foreach(
        fun({Lines, Expected}) -> Expected = index(Lines) end,
        DefaultCases),
    lists:foreach(
        fun({Lines, Exclude, Expected}) -> Expected = index(Lines, Exclude) end,
        ExcludeCases),
    passed.

%% Index a single line

%% Pair every indexable word of the line S with the single-element
%% line list [LineCount].
index_line(S, ExcludeWords, LineCount) ->
    [{Word, [LineCount]} || Word <- words(S, ExcludeWords)].

%% Checks for index_line/3; returns passed or crashes with a badmatch.
index_line_test() ->
    LineNo = 2,
    [] = index_line("", [], LineNo),
    Expected = [{"foo", [LineNo]}, {"nice", [LineNo]}, {"word", [LineNo]}],
    Expected = index_line("foo is a nice word! foo! foo!", [], LineNo),
    passed.

% Process words: split, normalise and filter

%% Turn a line of text into its indexable words: split on non
%% alphanumerics, lowercase, then filter (see filter_words/2).
words(S) -> words(S, []).

words(S, ExcludeWords) ->
    filter_words(normalise_words(split_words(S)), ExcludeWords).

%% Checks for words/1 and words/2; returns passed or crashes with a
%% badmatch.
words_test() ->
    Sentence = "Marley was dead: to begin with. There is no doubt whatever about that.",
    [] = words(""),
    [] = words("    "),
    [] = words("me and I", ["and"]),
    AllWords = ["about","begin","dead","doubt","marley","that","there","was","whatever","with"],
    AllWords = words(Sentence),
    Trimmed = ["about","begin","dead","doubt","marley","there","was","whatever"],
    Trimmed = words(Sentence, ["that","with"]),
    passed.

% Split strings into list of words

%% Split S at every non-alphanumeric character and drop the empty
%% fragments that adjacent separators leave behind.
split_words(S) ->
    Fragments = re:split(S, "[[:^alnum:]]", [{return,list}]),
    [Fragment || Fragment <- Fragments, Fragment =/= []].

%% Table-driven checks for split_words/1; returns passed or crashes
%% with a badmatch.
split_words_test() ->
    Cases = [
        {"", []},
        {"    ", []},
        {"hello world", ["hello", "world"]},
        {"hello 'world'", ["hello", "world"]},
        {"December, 1843. Stave 1:  Marley's Ghost",
         ["December", "1843", "Stave", "1", "Marley", "s", "Ghost"]}
    ],
    lists:foreach(
        fun({Input, Expected}) -> Expected = split_words(Input) end,
        Cases),
    passed.

% Normalise words

%% Lowercase every word in the list.
normalise_words(L) ->
    lists:map(fun string:lowercase/1, L).

%% Checks for normalise_words/1; returns passed or crashes with a
%% badmatch.
normalise_words_test() ->
    [] = normalise_words([]),
    MixedCase = ["DeCeMber", "1843", "STAVE", "1", "MarleY", "s", "Ghost"],
    Lowered = ["december", "1843", "stave", "1", "marley", "s", "ghost"],
    Lowered = normalise_words(MixedCase),
    passed.

% Filter words in a list:
%   remove words whose length is not greater than MIN_WORD_LENGTH and optionally exclude some words

%% Keep only the words worth indexing.
%% A word survives when its length is strictly greater than
%% ?MIN_WORD_LENGTH and it does not appear in Exclude. The result is
%% sorted and free of duplicates.
filter_words(L) -> filter_words(L, []).

filter_words(L, Exclude) ->
    %% Membership is tested per word instead of using `--`: `--` removes
    %% only the first occurrence of each excluded word, so an excluded
    %% word that is repeated in L would otherwise survive.
    Keep = fun(Word) ->
        length(Word) > ?MIN_WORD_LENGTH andalso not lists:member(Word, Exclude)
    end,
    lists:usort(lists:filter(Keep, L)). % Sorts words and removes duplicates

%% Checks for filter_words/1 and filter_words/2; returns passed or
%% crashes with a badmatch.
filter_words_test() ->
    Phrase = ["it", "is", "you", "my", "love"],
    Title = ["December", "1843", "Stave", "1", "Marley", "s", "Ghost"],
    Names = ["Marley", "Ghost"],
    [] = filter_words([]),
    ["love", "you"] = filter_words(Phrase),
    ["1843","December","Ghost","Marley","Stave"] = filter_words(Title),
    ["love", "you"] = filter_words(Phrase, []),
    ["you"] = filter_words(Phrase, ["love"]),
    ["1843","December","Stave"] = filter_words(Title, Names),
    %% Same title with "December" repeated: duplicates collapse.
    ["1843","December","Stave"] = filter_words(["December", "1843", "December", "Stave", "1", "Marley", "s", "Ghost"], Names),
    passed.

%% Add words to index

%% Merge a list of {Word, LineNumbers} entries into Index.
%% When the word is already indexed, its entry is replaced by one that
%% holds the sorted, de-duplicated union of both line lists and is moved
%% to the front; an unseen word is prepended as is.
add_words(Entries, Index) ->
    lists:foldl(
        fun({Word, NewLines} = Entry, Acc) ->
            case lists:keytake(Word, 1, Acc) of
                {value, {Word, OldLines}, Others} ->
                    [{Word, lists:usort(NewLines ++ OldLines)} | Others];
                _ ->
                    [Entry | Acc]
            end
        end,
        Index,
        Entries).

%% Table-driven checks for add_words/2; each row is
%% {NewEntries, Index, Expected}. Returns passed or crashes with a
%% badmatch.
add_words_test() ->
    Cases = [
        {[], [], []},
        {[], [{"hello", [1]}], [{"hello", [1]}]},
        {[{"hello", [1]}], [{"bye", [2]}],
         [{"hello",[1]},{"bye",[2]}]},
        {[{"hello", [1]}], [{"hello", [2]}],
         [{"hello",[1,2]}]},
        {[{"hello", [1]}], [{"bye", [3]}, {"hello", [2]}],
         [{"hello",[1,2]},{"bye", [3]}]},
        {[{"hello", [1]}, {"bye", [1]}], [{"bye", [3]}, {"hello", [2]}],
         [{"bye", [1,3]},{"hello",[1,2]}]},
        {[{"hello", [1]}, {"bye", [1]}], [{"bye", [1]}, {"hello", [2]}],
         [{"bye", [1]},{"hello",[1,2]}]}
    ],
    lists:foreach(
        fun({New, Index, Expected}) -> Expected = add_words(New, Index) end,
        Cases),
    passed.

%% Generate range

%% Collapse a list of line numbers into {First, Last} ranges: a number
%% that is exactly one more than the previous one extends the current
%% range, anything else starts a new range.
range([]) -> [];
range([First | Rest]) -> range(Rest, First, First, []).

%% range(Remaining, Last, Start, Acc): Start..Last is the range being
%% built; Acc holds the finished ranges, newest first.
range([Next | Rest], Last, Start, Acc) when Last + 1 == Next ->
    range(Rest, Next, Start, Acc);
range([Next | Rest], Last, Start, Acc) ->
    range(Rest, Next, Next, [{Start, Last} | Acc]);
range([], Last, Start, Acc) ->
    lists:reverse(Acc, [{Start, Last}]).

%% Table-driven checks for range/1; returns passed or crashes with a
%% badmatch.
range_test() ->
    Cases = [
        {[], []},
        {[1], [{1,1}]},
        {[1,2], [{1,2}]},
        {[1,4], [{1,1}, {4,4}]},
        {[1,2,3,4], [{1,4}]},
        {[1,3,4,5,7,9,10], [{1,1}, {3,5}, {7,7}, {9,10}]}
    ],
    lists:foreach(
        fun({LineNos, Expected}) -> Expected = range(LineNos) end,
        Cases),
    passed.
	-module(index).
	-export([get_file_contents/1,show_file_contents/1,index/1,index/2]).
	-export([split_words_test/0,filter_words_test/0,normalise_words_test/0,words_test/0,index_line_test/0,index_test/0,add_words_test/0,range_test/0]).

	-define(MIN_WORD_LENGTH, 2).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	%% Read the text file Name and return its lines in file order.
	%% Crashes with a badmatch when the file cannot be opened.
	get_file_contents(Name) ->
	    {ok, Device} = file:open(Name, [read]),
	    %% get_all_lines/2 accumulates newest-first, so flip the result.
	    lists:reverse(get_all_lines(Device, [])).

	% Auxiliary function for get_file_contents.
	% Not exported.

	%% Push every remaining line of the open io device File onto Partial
	%% (newest line first) and close the device once eof is reached.
	%% Only a trailing "\n" is stripped: a final line that is not terminated
	%% by a newline keeps its last character.
	get_all_lines(File,Partial) ->
	    case io:get_line(File,"") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            Strip = case lists:suffix("\n", Line) of
	                true -> lists:droplast(Line);
	                false -> Line
	            end,
	            get_all_lines(File,[Strip|Partial])
	    end.

	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.

	%% Print each string of the list on its own line; returns ok.
	%% Iterating with lists:foreach/2 needs no cons pattern in the head.
	show_file_contents(Lines) ->
	    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

	%% Assignment %%%

	%% Generate word index from lines

	%% Build a word index from a list of lines.
	%% Lines are numbered from 1. The result is a list of {Word, Ranges}
	%% tuples sorted by word, where Ranges is the list of {First, Last}
	%% line ranges produced by range/1. Words listed in ExcludeWords are
	%% left out.
	index(L) -> index(L, []).

	index(L, ExcludeWords) -> index(L, ExcludeWords, 1, []).

	index(Lines, ExcludeWords, FirstLineNo, Seed) ->
	    %% Thread the current line number through the fold together with
	    %% the index accumulated so far.
	    {_NextLineNo, Entries} = lists:foldl(
	        fun(Line, {LineNo, Acc}) ->
	            LineWords = index_line(Line, ExcludeWords, LineNo),
	            {LineNo + 1, add_words(LineWords, Acc)}
	        end,
	        {FirstLineNo, Seed},
	        Lines),
	    Ranged = [{Word, range(LineNos)} || {Word, LineNos} <- Entries],
	    lists:keysort(1, Ranged).

	%% Table-driven checks for index/1 and index/2; returns passed, or
	%% crashes with a badmatch on the first failing case.
	index_test() ->
	    DefaultCases = [
	        {[], []},
	        {[""], []},
	        {["hello world!"],
	         [{"hello",[{1,1}]},{"world",[{1,1}]}]},
	        {["hello world", "hello universe"],
	         [{"hello",[{1,2}]},{"universe",[{2,2}]},{"world",[{1,1}]}]}
	    ],
	    ExcludeCases = [
	        {["doh!"], ["doh"], []},
	        {["hello world!"], ["world"],
	         [{"hello", [{1,1}]}]},
	        {["hello baby", "hello world", "hello baby universe"], ["world"],
	         [{"baby", [{1,1},{3,3}]},{"hello",[{1,3}]},{"universe",[{3,3}]}]},
	        {["hello world", "hello universe"], ["hello"],
	         [{"universe",[{2,2}]}, {"world",[{1,1}]}]}
	    ],
	    lists:foreach(
	        fun({Lines, Expected}) -> Expected = index(Lines) end,
	        DefaultCases),
	    lists:foreach(
	        fun({Lines, Exclude, Expected}) -> Expected = index(Lines, Exclude) end,
	        ExcludeCases),
	    passed.

	%% Index a single line

	%% Pair every indexable word of the line S with the single-element
	%% line list [LineCount].
	index_line(S, ExcludeWords, LineCount) ->
	    [{Word, [LineCount]} || Word <- words(S, ExcludeWords)].

	%% Checks for index_line/3; returns passed or crashes with a badmatch.
	index_line_test() ->
	    LineNo = 2,
	    [] = index_line("", [], LineNo),
	    Expected = [{"foo", [LineNo]}, {"nice", [LineNo]}, {"word", [LineNo]}],
	    Expected = index_line("foo is a nice word! foo! foo!", [], LineNo),
	    passed.

	% Process words: split, normalise and filter

	%% Turn a line of text into its indexable words: split on non
	%% alphanumerics, lowercase, then filter (see filter_words/2).
	words(S) -> words(S, []).

	words(S, ExcludeWords) ->
	    filter_words(normalise_words(split_words(S)), ExcludeWords).

	%% Checks for words/1 and words/2; returns passed or crashes with a
	%% badmatch.
	words_test() ->
	    Sentence = "Marley was dead: to begin with. There is no doubt whatever about that.",
	    [] = words(""),
	    [] = words(" "),
	    [] = words("me and I", ["and"]),
	    AllWords = ["about","begin","dead","doubt","marley","that","there","was","whatever","with"],
	    AllWords = words(Sentence),
	    Trimmed = ["about","begin","dead","doubt","marley","there","was","whatever"],
	    Trimmed = words(Sentence, ["that","with"]),
	    passed.

	% Split strings into list of words

	%% Split S at every non-alphanumeric character and drop the empty
	%% fragments that adjacent separators leave behind.
	split_words(S) ->
	    Fragments = re:split(S, "[[:^alnum:]]", [{return,list}]),
	    [Fragment || Fragment <- Fragments, Fragment =/= []].

	%% Table-driven checks for split_words/1; returns passed or crashes
	%% with a badmatch.
	split_words_test() ->
	    Cases = [
	        {"", []},
	        {" ", []},
	        {"hello world", ["hello", "world"]},
	        {"hello 'world'", ["hello", "world"]},
	        {"December, 1843. Stave 1: Marley's Ghost",
	         ["December", "1843", "Stave", "1", "Marley", "s", "Ghost"]}
	    ],
	    lists:foreach(
	        fun({Input, Expected}) -> Expected = split_words(Input) end,
	        Cases),
	    passed.

	% Normalise words

	%% Lowercase every word in the list.
	normalise_words(L) ->
	    lists:map(fun string:lowercase/1, L).

	%% Checks for normalise_words/1; returns passed or crashes with a
	%% badmatch.
	normalise_words_test() ->
	    [] = normalise_words([]),
	    MixedCase = ["DeCeMber", "1843", "STAVE", "1", "MarleY", "s", "Ghost"],
	    Lowered = ["december", "1843", "stave", "1", "marley", "s", "ghost"],
	    Lowered = normalise_words(MixedCase),
	    passed.

	% Filter words in a list:
	% remove words whose length is not greater than MIN_WORD_LENGTH and optionally exclude some words

	%% Keep only the words worth indexing.
	%% A word survives when its length is strictly greater than
	%% ?MIN_WORD_LENGTH and it does not appear in Exclude. The result is
	%% sorted and free of duplicates.
	filter_words(L) -> filter_words(L, []).

	filter_words(L, Exclude) ->
	    %% Membership is tested per word instead of using `--`: `--` removes
	    %% only the first occurrence of each excluded word, so an excluded
	    %% word that is repeated in L would otherwise survive.
	    Keep = fun(Word) ->
	        length(Word) > ?MIN_WORD_LENGTH andalso not lists:member(Word, Exclude)
	    end,
	    lists:usort(lists:filter(Keep, L)). % Sorts words and removes duplicates

	%% Checks for filter_words/1 and filter_words/2; returns passed or
	%% crashes with a badmatch.
	filter_words_test() ->
	    Phrase = ["it", "is", "you", "my", "love"],
	    Title = ["December", "1843", "Stave", "1", "Marley", "s", "Ghost"],
	    Names = ["Marley", "Ghost"],
	    [] = filter_words([]),
	    ["love", "you"] = filter_words(Phrase),
	    ["1843","December","Ghost","Marley","Stave"] = filter_words(Title),
	    ["love", "you"] = filter_words(Phrase, []),
	    ["you"] = filter_words(Phrase, ["love"]),
	    ["1843","December","Stave"] = filter_words(Title, Names),
	    %% Same title with "December" repeated: duplicates collapse.
	    ["1843","December","Stave"] = filter_words(["December", "1843", "December", "Stave", "1", "Marley", "s", "Ghost"], Names),
	    passed.

	%% Add words to index

	%% Merge a list of {Word, LineNumbers} entries into Index.
	%% When the word is already indexed, its entry is replaced by one that
	%% holds the sorted, de-duplicated union of both line lists and is moved
	%% to the front; an unseen word is prepended as is.
	add_words(Entries, Index) ->
	    lists:foldl(
	        fun({Word, NewLines} = Entry, Acc) ->
	            case lists:keytake(Word, 1, Acc) of
	                {value, {Word, OldLines}, Others} ->
	                    [{Word, lists:usort(NewLines ++ OldLines)} | Others];
	                _ ->
	                    [Entry | Acc]
	            end
	        end,
	        Index,
	        Entries).

	%% Table-driven checks for add_words/2; each row is
	%% {NewEntries, Index, Expected}. Returns passed or crashes with a
	%% badmatch.
	add_words_test() ->
	    Cases = [
	        {[], [], []},
	        {[], [{"hello", [1]}], [{"hello", [1]}]},
	        {[{"hello", [1]}], [{"bye", [2]}],
	         [{"hello",[1]},{"bye",[2]}]},
	        {[{"hello", [1]}], [{"hello", [2]}],
	         [{"hello",[1,2]}]},
	        {[{"hello", [1]}], [{"bye", [3]}, {"hello", [2]}],
	         [{"hello",[1,2]},{"bye", [3]}]},
	        {[{"hello", [1]}, {"bye", [1]}], [{"bye", [3]}, {"hello", [2]}],
	         [{"bye", [1,3]},{"hello",[1,2]}]},
	        {[{"hello", [1]}, {"bye", [1]}], [{"bye", [1]}, {"hello", [2]}],
	         [{"bye", [1]},{"hello",[1,2]}]}
	    ],
	    lists:foreach(
	        fun({New, Index, Expected}) -> Expected = add_words(New, Index) end,
	        Cases),
	    passed.

	%% Generate range

	%% Collapse a list of line numbers into {First, Last} ranges: a number
	%% that is exactly one more than the previous one extends the current
	%% range, anything else starts a new range.
	range([]) -> [];
	range([First | Rest]) -> range(Rest, First, First, []).

	%% range(Remaining, Last, Start, Acc): Start..Last is the range being
	%% built; Acc holds the finished ranges, newest first.
	range([Next | Rest], Last, Start, Acc) when Last + 1 == Next ->
	    range(Rest, Next, Start, Acc);
	range([Next | Rest], Last, Start, Acc) ->
	    range(Rest, Next, Next, [{Start, Last} | Acc]);
	range([], Last, Start, Acc) ->
	    lists:reverse(Acc, [{Start, Last}]).

	%% Table-driven checks for range/1; returns passed or crashes with a
	%% badmatch.
	range_test() ->
	    Cases = [
	        {[], []},
	        {[1], [{1,1}]},
	        {[1,2], [{1,2}]},
	        {[1,4], [{1,1}, {4,4}]},
	        {[1,2,3,4], [{1,4}]},
	        {[1,3,4,5,7,9,10], [{1,1}, {3,5}, {7,7}, {9,10}]}
	    ],
	    lists:foreach(
	        fun({LineNos, Expected}) -> Expected = range(LineNos) end,
	        Cases),
	    passed.