Skip to content

Instantly share code, notes, and snippets.

@pppillai
Created May 19, 2020 17:24
Show Gist options
  • Save pppillai/7981c32bc12c5b95907b54fe180300c5 to your computer and use it in GitHub Desktop.
Save pppillai/7981c32bc12c5b95907b54fe180300c5 to your computer and use it in GitHub Desktop.
-module(index).
-compile([export_all]).
-export([index_text/1]).
-define(MINLENGTH, 8).
-include_lib("eunit/include/eunit.hrl").
%% To run: compile with c(index). and then call
%% index:index_text("FullPathOfFile").
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Reads the text file Name and returns its lines as a list of strings,
% first line first.
% The lines are collected by get_all_lines/2 (which drops the trailing
% newline of each line and closes the file) in reverse order, so the
% result is reversed here.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents/1.
% Reads File line by line until eof, prepending each line (without its
% line terminator) to Partial, so the result is in reverse file order.
% File is closed before this function returns or raises.
% Raises error({read_error, Reason}) if io:get_line/2 reports an error.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            % Release the handle before failing so it is not leaked.
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [strip_line_ending(Line) | Partial])
    end.

% Removes a single trailing "\r\n" or "\n" from Line when present.
% A line with no terminator (e.g. the last line of a file that does not
% end in a newline) is returned unchanged rather than losing a character.
strip_line_ending(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
% Prints each string of the given list on its own line and returns ok.
% Handy for checking the result of get_file_contents/1.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
% Builds a word index for the text file at FilePath.
% Returns a list of {Word, Ranges}, where Word is a lower-cased word and
% Ranges is a list of {From, To} tuples covering the runs of consecutive
% line numbers on which the word occurs (a single line N is {N, N}).
% Entries come back in the order produced by finalize_index/2.
index_text(FilePath) ->
    %% Lower-case every line. Empty lines are kept on purpose: dropping them
    %% would shift the line numbers of everything that follows.
    FileContents = [lower_case(X) || X <- get_file_contents(FilePath)],
    %% Pair each line with its 1-based line number: [{Line, LineNumber}, ...]
    ProcessedLines = lists:zip(FileContents, lists:seq(1, length(FileContents))),
    %% One {Word, [LineNumber]} entry per word occurrence. Quotes, question
    %% marks, parentheses, tabs and carriage returns are separators too, so
    %% that e.g. a quoted word is indexed under the same key as the bare word.
    %% NOTE(review): the comparison is strict, so words of exactly ?MINLENGTH
    %% characters are NOT indexed - confirm this is the intended threshold.
    AllWords = [{Y, [N]} || {Line, N} <- ProcessedLines,
                            Y <- string:lexemes(Line, "/[]{}()!?;:_-,.\" \t\r"),
                            length(Y) > ?MINLENGTH],
    %% Merge the entries per word and drop repeated line numbers.
    FinalIndex = [{Word, deduplicate(N, [])} || {Word, N} <- finalize_index(AllWords, [])],
    %% Collapse consecutive line numbers into {From, To} ranges;
    %% make_tuple_range/3 accumulates them back to front, hence the reverse.
    [{Word, lists:reverse(make_tuple_range([X | T], X, []))} || {Word, [X | T]} <- FinalIndex].
% Merges a list of {Word, LineNumbers} entries into Result so that each
% word appears once, with the line-number lists of all its entries
% concatenated in input order.
% A word not yet present is prepended to Result; a word already present
% keeps its position and has the new line numbers appended.
finalize_index([], Result) ->
    Result;
finalize_index([{Word, LineCountList} | Tail], Result) ->
    % A single lookup both tests for presence and fetches the current value.
    NewResult =
        case lists:keyfind(Word, 1, Result) of
            false ->
                [{Word, LineCountList} | Result];
            {Word, NumberList} ->
                lists:keyreplace(Word, 1, Result, {Word, NumberList ++ LineCountList})
        end,
    finalize_index(Tail, NewResult).
% Converts the ASCII upper-case letters A-Z in a string to lower case.
% Every other element (digits, punctuation, non-ASCII values) is passed
% through unchanged.
lower_case([]) ->
    [];
lower_case([Char | Tail]) when Char >= $A, Char =< $Z ->
    % Upper- and lower-case ASCII letters are 32 code points apart.
    [Char + 32 | lower_case(Tail)];
lower_case([Char | Tail]) ->
    [Char | lower_case(Tail)].
% Removes repeated elements from a list, keeping the first occurrence of
% each and preserving their relative order.
% Result is the accumulator of elements already kept, most recent first;
% callers start with []. The returned list is the accumulator reversed.
deduplicate(Items, Result) ->
    KeepIfNew =
        fun(Item, Seen) ->
            case lists:member(Item, Seen) of
                true -> Seen;
                false -> [Item | Seen]
            end
        end,
    lists:reverse(lists:foldl(KeepIfNew, Result, Items)).
% Collapses a list of numbers into {From, To} ranges of consecutive values.
% The second argument is the first value of the range currently being
% built; the third is the accumulator of finished ranges.
% Ranges are accumulated by prepending, so the result lists the last range
% first; callers reverse it if they need ascending order.
make_tuple_range([], _RangeStart, Ranges) ->
    Ranges;
make_tuple_range([Last], RangeStart, Ranges) ->
    [{RangeStart, Last} | Ranges];
make_tuple_range([Current | [Next | _] = Rest], RangeStart, Ranges) ->
    {NextStart, NextRanges} =
        case Next - Current == 1 of
            % Next continues the current run: keep extending it.
            true -> {RangeStart, Ranges};
            % Gap found: close the run at Current and start a new one at Next.
            false -> {Next, [{RangeStart, Current} | Ranges]}
        end,
    make_tuple_range(Rest, NextStart, NextRanges).
@fjpse
Copy link

fjpse commented May 19, 2020

I love your code. I love list comprehensions. Comments are brief but very clear and useful.
Only two comments: I think you can do "deduplicate" at the same time as "make_tuple_range", and why not use string:lowercase?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment