Skip to content

Instantly share code, notes, and snippets.

@fjpse
Created May 19, 2020 22:06
Show Gist options
  • Save fjpse/3fcf0e04960234a25ceddeb9c8eb94b4 to your computer and use it in GitHub Desktop.
Save fjpse/3fcf0e04960234a25ceddeb9c8eb94b4 to your computer and use it in GitHub Desktop.
-module(index).
-export([
process_file/2
]).
%%
%% process_file(InputFile, OutputFile) -> Index
%%
%% Indexes the words of a text file:
%% a) reads the lines of InputFile with read_from_file/1,
%% b) builds the word index from those lines with process_text/1,
%% c) writes that index to OutputFile with write_to_file/2,
%% d) returns the index that was written.
%%
process_file(InputFile, OutputFile) ->
    Lines = read_from_file(InputFile),
    WordIndex = process_text(Lines),
    write_to_file(OutputFile, WordIndex),
    WordIndex.
%%
%% process_text(Text) -> Index
%%
%% Text is a list of lines. The lines are indexed by process_text/3, counting
%% from line number 1 and starting with an empty dict. The resulting dict is
%% converted to a list of {Key, Value} pairs and sorted by key.
%%
%% The commented-out one-liners are the equivalent entry points for the
%% tuple-list and orddict index representations.
%%
% TUPLELIST
% process_text(Text) -> lists:keysort(1, process_text(Text, 1, [])).
% ORDDICT
% process_text(Text) -> orddict:to_list(process_text(Text, 1, orddict:new())).
% DICT
process_text(Text) ->
    WordDict = process_text(Text, 1, dict:new()),
    Entries = dict:to_list(WordDict),
    lists:keysort(1, Entries).
%%
%% process_text(Text, LineNumber, Index) -> NewIndex
%%
%% Feeds every line of Text to process_line/3. The first line is numbered
%% LineNumber and the number grows by one per line. Each call receives the
%% index produced by the previous one, and the index left after the last line
%% is returned. An empty Text returns Index unchanged.
%%
process_text(Text, LineNumber, Index) ->
    Step = fun(Line, {Number, Acc}) ->
        Updated = process_line(Line, Number, Acc),
        {Number + 1, Updated}
    end,
    {_NextNumber, NewIndex} = lists:foldl(Step, {LineNumber, Index}, Text),
    NewIndex.
%%
%% process_line(Line, LineNumber, Index) -> NewIndex
%%
%% Splits Line into words and hands them to process_words/3 together with
%% LineNumber and Index; the index returned by that call is the result.
%%
%% Words are delimited by whitespace and punctuation. string:lexemes/2 compares
%% whole grapheme clusters, and a carriage return followed by a line feed forms
%% a single cluster, so "\r\n" is listed as a separator of its own. With only
%% the separate "\r" and "\n" entries, the line ending of a CRLF file would
%% stay glued to the last word of every line.
%%
process_line(Line, LineNumber, Index) ->
    Separators = ["\r\n" | " \r\n\t.,;:-`´\"'!?()[]{}\\/"],
    Words = string:lexemes(Line, Separators),
    process_words(Words, LineNumber, Index).
%%
%% process_words(Words, LineNumber, Index) -> NewIndex
%%
%% Adds every word of the list Words to Index, which must be a dict().
%% Words of 3 or fewer characters are ignored. Every other word is:
%% a) converted to lowercase,
%% b) used as the key under which LineNumber is appended to the list of line
%%    numbers already stored for that word; dict:append/3 creates the entry
%%    when the word is not in the index yet, so no separate lookup is needed.
%%
%% A word that occurs several times in Words records LineNumber once per
%% occurrence. When Words is empty, Index is returned unchanged.
%%
process_words(Words, LineNumber, Index) ->
    lists:foldl(
        fun
            % Matching at least four elements replaces a length/1 guard, which
            % would walk the whole word just to compare it with 3.
            ([_, _, _, _ | _] = Word, Acc) ->
                dict:append(string:lowercase(Word), LineNumber, Acc);
            (_ShortWord, Acc) ->
                Acc
        end,
        Index,
        Words
    ).
%%
%% read_from_file(FileName) -> [Line]
%%
%% Opens FileName for reading, collects its lines with read_lines/2 and
%% returns them in file order.
%%
%% Crashes with a badmatch if the file cannot be opened. The file is closed in
%% an 'after' section so that the handle is released even when reading raises.
%%
read_from_file(FileName) ->
    {ok, File} = file:open(FileName, [read]),
    try
        read_lines(File, [])
    after
        file:close(File)
    end.
%%
%% read_lines(File, Text) -> Lines
%%
%% Reads File line by line until end of file. Text is the accumulator: it
%% holds the lines read so far, most recent first, and callers start with [].
%% The result is the accumulator reversed, i.e. the lines in reading order.
%%
%% io:get_line/2 can also answer {error, Reason}. That answer raises
%% {read_error, Reason} instead of being stored as if it were a line, which
%% would make the loop spin forever on a persistent error.
%%
read_lines(File, Text) ->
    case io:get_line(File, "") of
        eof ->
            lists:reverse(Text);
        {error, Reason} ->
            error({read_error, Reason});
        Line ->
            read_lines(File, [Line | Text])
    end.
%%
%% write_to_file(FileName, Index) -> ok
%%
%% Opens FileName for writing and writes all the entries of Index to it with
%% write_index/2, one entry per line.
%%
%% NOTES: the file is opened in write mode, so existing content is
%% overwritten. Crashes with a badmatch if the file cannot be opened. The file
%% is closed in an 'after' section so that the handle is released even when
%% writing an entry raises.
%%
write_to_file(Name, Index) ->
    {ok, File} = file:open(Name, [write]),
    try
        write_index(File, Index)
    after
        file:close(File)
    end.
%%
%% write_index(File, Index) -> ok
%%
%% Writes the entries of the list Index to File in list order, calling
%% write_entry/2 once per entry. An empty Index writes nothing.
%%
write_index(File, Index) ->
    lists:foreach(fun(Entry) -> write_entry(File, Entry) end, Index).
%%
%% write_entry(File, IndexEntry) -> ok
%%
%% Writes one index entry, a {Word, LineNumbers} tuple, to File as a single
%% line with the format:
%% <word> <line>,<line>,...
%%
%% NOTE: the line numbers are written by write_numbers/2 rather than by
%% formatting the whole list with one control sequence.
%%
write_entry(File, {Word, LineNumbers}) ->
    io:format(File, "~s ", [Word]),
    write_numbers(File, LineNumbers),
    io:format(File, "~n", []).
%%
%% write_numbers(File, LineNumbers) -> ok
%%
%% Writes LineNumbers to File as decimal integers separated by commas, with no
%% trailing comma and no newline. An empty list writes nothing.
%%
%% The text is assembled first and sent with a single io:put_chars/2 call
%% instead of one io request per number.
%%
write_numbers(File, LineNumbers) ->
    Digits = [integer_to_list(Number) || Number <- LineNumbers],
    io:put_chars(File, lists:join(",", Digits)).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment