Skip to content

Instantly share code, notes, and snippets.

@adhalanay
Last active May 24, 2020 13:15
Show Gist options
  • Save adhalanay/5bfd4f0b7e8e97a46277f17bb0daabb0 to your computer and use it in GitHub Desktop.
Save adhalanay/5bfd4f0b7e8e97a46277f17bb0daabb0 to your computer and use it in GitHub Desktop.
-module(index_hw).
-export([get_file_contents/1,show_file_contents/1,main/1]).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls);
show_file_contents([]) ->
ok.
% take only relevant words, remove punctuation and convevert to lowercase
get_words(X)->
Xs = string:lexemes(string:lowercase(X)," .,;:!?=-+--"),
[Y || Y <-Xs, string:len(Y) >= 3, Y/="and", Y/="the", Y/="that", Y/="are",Y/="this", Y/="all", Y/="now",Y/="but",Y/="can"].
% As there are some important three letter words I manually add some common words to be not considered.
% Create an index of words for each line.
add_to_index([], _, Acc) ->
Acc;
add_to_index([X|Xs],Poz,Acc) ->
add_to_index(Xs, Poz, [{X,Poz} | Acc]).
% This function is called for every line. Create a list with elements {"word",Poz}
count_words(Line,Poz,Acc)->
Word_List=get_words(Line),
add_to_index(Word_List,Poz, Acc).
% This function walks over lines and calls the previous function for eaxh one
process_lines(Xs)->
process_lines_aux(Xs,0,[]).
process_lines_aux([],_Poz,Acc)->
lists:sort(Acc);
process_lines_aux([X|Xs],Poz,Acc) ->
Next_Poz = Poz+1,
process_lines_aux(Xs,Next_Poz,count_words(X,Next_Poz,Acc)).
% The fundamental function which takes a list as produced by process_lines/1 and produces the required list. It uses the auxiliary function make_ranges/1,
collect_words_sorted(Words)->
lists:sort(collect_words(Words,[],[])). % Use lexicographic ordering on words
collect_words([{Word,Poz}],Acc,Index)->
[{Word,make_ranges(lists:sort([Poz | Acc]))}|Index];
collect_words([{Word,Poz1},{Word,_Poz2}|Rest],Acc,Index) ->
collect_words(Rest,[Poz1|Acc],Index);
collect_words([{Word1,Poz1},{_Word2,_Poz2}|Rest],Acc,Index) ->
Index0 = make_ranges([Poz1|Acc]),
collect_words(Rest,[],[{Word1,Index0}|Index]).
% Converts a list of numbers to ranges like [1] -> {1,1},[1,2,3]->{1,3}etc.
make_ranges(Xs) ->
make_ranges_sorted(lists:sort(Xs),[]). % first sort the list
make_ranges_sorted([],Acc)->
lists:reverse(Acc);
make_ranges_sorted([X|Xs],[{X0}|Acc]) when X == X0+1 ->
make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],[{X0,X}|Acc]) ->
make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],[{X0,X1}|Acc]) when X==X1+1->
make_ranges_sorted(Xs,[{X0,X}|Acc]);
make_ranges_sorted([X|Xs],Acc) ->
make_ranges_sorted(Xs,[{X,X}|Acc]).
% Gluing it all together
main(Name)->
L = get_file_contents(Name),
collect_words_sorted(process_lines(L)).
% This approach does not seem optimal. I believie it is possible to build the index in one passing over the list of lines, but I
% was unable to do it.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment