Last active
May 24, 2020 13:15
-
-
Save adhalanay/5bfd4f0b7e8e97a46277f17bb0daabb0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index_hw). | |
-export([get_file_contents/1,show_file_contents/1,main/1]). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
show_file_contents([L|Ls]) -> | |
io:format("~s~n",[L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. | |
% take only relevant words, remove punctuation and convevert to lowercase | |
get_words(X)-> | |
Xs = string:lexemes(string:lowercase(X)," .,;:!?=-+--"), | |
[Y || Y <-Xs, string:len(Y) >= 3, Y/="and", Y/="the", Y/="that", Y/="are",Y/="this", Y/="all", Y/="now",Y/="but",Y/="can"]. | |
% As there are some important three letter words I manually add some common words to be not considered. | |
% Create an index of words for each line. | |
add_to_index([], _, Acc) -> | |
Acc; | |
add_to_index([X|Xs],Poz,Acc) -> | |
add_to_index(Xs, Poz, [{X,Poz} | Acc]). | |
% This function is called for every line. Create a list with elements {"word",Poz} | |
count_words(Line,Poz,Acc)-> | |
Word_List=get_words(Line), | |
add_to_index(Word_List,Poz, Acc). | |
% This function walks over lines and calls the previous function for eaxh one | |
process_lines(Xs)-> | |
process_lines_aux(Xs,0,[]). | |
process_lines_aux([],_Poz,Acc)-> | |
lists:sort(Acc); | |
process_lines_aux([X|Xs],Poz,Acc) -> | |
Next_Poz = Poz+1, | |
process_lines_aux(Xs,Next_Poz,count_words(X,Next_Poz,Acc)). | |
% The fundamental function which takes a list as produced by process_lines/1 and produces the required list. It uses the auxiliary function make_ranges/1, | |
collect_words_sorted(Words)-> | |
lists:sort(collect_words(Words,[],[])). % Use lexicographic ordering on words | |
collect_words([{Word,Poz}],Acc,Index)-> | |
[{Word,make_ranges(lists:sort([Poz | Acc]))}|Index]; | |
collect_words([{Word,Poz1},{Word,_Poz2}|Rest],Acc,Index) -> | |
collect_words(Rest,[Poz1|Acc],Index); | |
collect_words([{Word1,Poz1},{_Word2,_Poz2}|Rest],Acc,Index) -> | |
Index0 = make_ranges([Poz1|Acc]), | |
collect_words(Rest,[],[{Word1,Index0}|Index]). | |
% Converts a list of numbers to ranges like [1] -> {1,1},[1,2,3]->{1,3}etc. | |
make_ranges(Xs) -> | |
make_ranges_sorted(lists:sort(Xs),[]). % first sort the list | |
make_ranges_sorted([],Acc)-> | |
lists:reverse(Acc); | |
make_ranges_sorted([X|Xs],[{X0}|Acc]) when X == X0+1 -> | |
make_ranges_sorted(Xs,[{X0,X}|Acc]); | |
make_ranges_sorted([X|Xs],[{X0,X}|Acc]) -> | |
make_ranges_sorted(Xs,[{X0,X}|Acc]); | |
make_ranges_sorted([X|Xs],[{X0,X1}|Acc]) when X==X1+1-> | |
make_ranges_sorted(Xs,[{X0,X}|Acc]); | |
make_ranges_sorted([X|Xs],Acc) -> | |
make_ranges_sorted(Xs,[{X,X}|Acc]). | |
% Gluing it all together | |
main(Name)-> | |
L = get_file_contents(Name), | |
collect_words_sorted(process_lines(L)). | |
% This approach does not seem optimal. I believie it is possible to build the index in one passing over the list of lines, but I | |
% was unable to do it. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment