adhalanay/index_hw.erl

## index_hw.erl
-module(index_hw).
-export([get_file_contents/1,show_file_contents/1,main/1]).

% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)


% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.

get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
lists:reverse(Rev).

% Auxiliary function for get_file_contents.
% Not exported.

get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.

show_file_contents([L|Ls]) ->
    io:format("~s~n",[L]),
    show_file_contents(Ls);
 show_file_contents([]) ->
    ok.

% take only relevant words, remove punctuation and convevert to lowercase
get_words(X)->
    Xs = string:lexemes(string:lowercase(X)," .,;:!?=-+--"),
    [Y || Y <-Xs, string:len(Y) >= 3, Y/="and", Y/="the", Y/="that", Y/="are",Y/="this", Y/="all", Y/="now",Y/="but",Y/="can"].
% As there are some important three letter words I manually add some common  words to be not considered.

% Create an index of words for each line.

add_to_index([], _, Acc) ->
    Acc;
add_to_index([X|Xs],Poz,Acc) ->
    add_to_index(Xs, Poz, [{X,Poz} | Acc]).

%  This function is called for every line. Create a list with elements {"word",Poz}

count_words(Line,Poz,Acc)->
    Word_List=get_words(Line),
    add_to_index(Word_List,Poz, Acc).

% This function walks over lines and calls the previous function for eaxh one

process_lines(Xs)->
    process_lines_aux(Xs,0,[]).
process_lines_aux([],_Poz,Acc)->
    lists:sort(Acc);
process_lines_aux([X|Xs],Poz,Acc) ->
    Next_Poz = Poz+1,
    process_lines_aux(Xs,Next_Poz,count_words(X,Next_Poz,Acc)).

% The fundamental function which takes a list as produced by process_lines/1 and produces the required list. It uses the auxiliary function make_ranges/1,

collect_words_sorted(Words)->
    lists:sort(collect_words(Words,[],[])). % Use lexicographic ordering on words
collect_words([{Word,Poz}],Acc,Index)->
    [{Word,make_ranges(lists:sort([Poz | Acc]))}|Index];
collect_words([{Word,Poz1},{Word,_Poz2}|Rest],Acc,Index) ->
    collect_words(Rest,[Poz1|Acc],Index);
collect_words([{Word1,Poz1},{_Word2,_Poz2}|Rest],Acc,Index) ->
    Index0 = make_ranges([Poz1|Acc]),
    collect_words(Rest,[],[{Word1,Index0}|Index]).
% Converts a list of numbers to ranges like [1] -> {1,1},[1,2,3]->{1,3}etc.
make_ranges(Xs) ->
     make_ranges_sorted(lists:sort(Xs),[]). % first sort the list
make_ranges_sorted([],Acc)->
    lists:reverse(Acc);
make_ranges_sorted([X|Xs],[{X0}|Acc]) when X == X0+1 ->
    make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],[{X0,X}|Acc]) ->
    make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],[{X0,X1}|Acc]) when X==X1+1->
    make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],Acc) ->
    make_ranges_sorted(Xs,[{X,X}|Acc]).

% Gluing it all together

main(Name)->
    L = get_file_contents(Name),
    collect_words_sorted(process_lines(L)).

% This approach does not seem optimal. I believie it is possible to build the index in one passing over the list of lines, but I
% was unable to do it.
	-module(index_hw).
	-export([get_file_contents/1,show_file_contents/1,main/1]).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	get_file_contents(Name) ->
	{ok,File} = file:open(Name,[read]),
	Rev = get_all_lines(File,[]),
	lists:reverse(Rev).

	% Auxiliary function for get_file_contents.
	% Not exported.

	get_all_lines(File,Partial) ->
	case io:get_line(File,"") of
	eof -> file:close(File),
	Partial;
	Line -> {Strip,_} = lists:split(length(Line)-1,Line),
	get_all_lines(File,[Strip\|Partial])
	end.

	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.

	show_file_contents([L\|Ls]) ->
	io:format("~s~n",[L]),
	show_file_contents(Ls);
	show_file_contents([]) ->
	ok.

	% take only relevant words, remove punctuation and convevert to lowercase
	get_words(X)->
	Xs = string:lexemes(string:lowercase(X)," .,;:!?=-+--"),
	[Y \|\| Y <-Xs, string:len(Y) >= 3, Y/="and", Y/="the", Y/="that", Y/="are",Y/="this", Y/="all", Y/="now",Y/="but",Y/="can"].
	% As there are some important three letter words I manually add some common words to be not considered.

	% Create an index of words for each line.

	add_to_index([], _, Acc) ->
	Acc;
	add_to_index([X\|Xs],Poz,Acc) ->
	add_to_index(Xs, Poz, [{X,Poz} \| Acc]).

	% This function is called for every line. Create a list with elements {"word",Poz}

	count_words(Line,Poz,Acc)->
	Word_List=get_words(Line),
	add_to_index(Word_List,Poz, Acc).

	% This function walks over lines and calls the previous function for eaxh one

	process_lines(Xs)->
	process_lines_aux(Xs,0,[]).
	process_lines_aux([],_Poz,Acc)->
	lists:sort(Acc);
	process_lines_aux([X\|Xs],Poz,Acc) ->
	Next_Poz = Poz+1,
	process_lines_aux(Xs,Next_Poz,count_words(X,Next_Poz,Acc)).

	% The fundamental function which takes a list as produced by process_lines/1 and produces the required list. It uses the auxiliary function make_ranges/1,

	collect_words_sorted(Words)->
	lists:sort(collect_words(Words,[],[])). % Use lexicographic ordering on words
	collect_words([{Word,Poz}],Acc,Index)->
	[{Word,make_ranges(lists:sort([Poz \| Acc]))}\|Index];
	collect_words([{Word,Poz1},{Word,_Poz2}\|Rest],Acc,Index) ->
	collect_words(Rest,[Poz1\|Acc],Index);
	collect_words([{Word1,Poz1},{_Word2,_Poz2}\|Rest],Acc,Index) ->
	Index0 = make_ranges([Poz1\|Acc]),
	collect_words(Rest,[],[{Word1,Index0}\|Index]).
	% Converts a list of numbers to ranges like [1] -> {1,1},[1,2,3]->{1,3}etc.
	make_ranges(Xs) ->
	make_ranges_sorted(lists:sort(Xs),[]). % first sort the list
	make_ranges_sorted([],Acc)->
	lists:reverse(Acc);
	make_ranges_sorted([X\|Xs],[{X0}\|Acc]) when X == X0+1 ->
	make_ranges_sorted(Xs,[{X0,X}\|Acc]);
	make_ranges_sorted([X\|Xs],[{X0,X}\|Acc]) ->
	make_ranges_sorted(Xs,[{X0,X}\|Acc]);
	make_ranges_sorted([X\|Xs],[{X0,X1}\|Acc]) when X==X1+1->
	make_ranges_sorted(Xs,[{X0,X}\|Acc]);
	make_ranges_sorted([X\|Xs],Acc) ->
	make_ranges_sorted(Xs,[{X,X}\|Acc]).

	% Gluing it all together

	main(Name)->
	L = get_file_contents(Name),
	collect_words_sorted(process_lines(L)).

	% This approach does not seem optimal. I believie it is possible to build the index in one passing over the list of lines, but I
	% was unable to do it.