Skip to content

Instantly share code, notes, and snippets.

@ekalinin
Created March 3, 2017 14:14
Show Gist options
  • Save ekalinin/183ba30657f816295232edda33fdbe5c to your computer and use it in GitHub Desktop.
Save ekalinin/183ba30657f816295232edda33fdbe5c to your computer and use it in GitHub Desktop.
indexing a file
-module(index).
-compile([export_all]).
% -export([get_file_contents/1,show_file_contents/1]).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Read the text file Name and return its lines as a list of strings,
% first line first.
% The file is opened read-only; if it cannot be opened the {ok, _} match
% fails and the caller crashes with badmatch.
% get_all_lines/2 does the actual reading (and closes the file); it hands
% the lines back newest-first, hence the reverse.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Reads File one line at a time until eof and pushes every line onto
% Partial, so the returned list is in reverse file order.
% The file is closed as soon as eof is seen.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Remove one trailing newline from Line, if present.
% The last line of a file may not end in "\n"; blindly dropping the final
% character would eat a real character of that line.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
% Print each string of the list on its own line and return ok.
% Handy for checking what get_file_contents returned.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%%%
%%% Solution
%%%
% Build the word index of the text file Filename.
% Returns whatever index_lines/1 returns for the lines of the file:
% a sorted list of {Word, Ranges} tuples, where Ranges is a list of
% {FromRow, ToRow} row-number pairs.
index_file(Filename) ->
    Lines = get_file_contents(Filename),
    index_lines(Lines).
% Index a list of lines.
% The lines are folded left to right with index_line/2. The fold state is
% {RowNumber, Index}: the number of the line about to be processed
% (starting at 1) and the index built so far, kept in a dict.
% The finished dict is returned as a list of {Word, Ranges} sorted by word.
index_lines(Lines) ->
    Seed = {1, dict:new()},
    {_NextRow, Dict} = lists:foldl(fun index_line/2, Seed, Lines),
    Entries = dict:to_list(Dict),
    lists:sort(Entries).
% Index a single line.
% Line is split on spaces and every token is passed through
% proccess_word/2 together with the {RowNumber, Index} state.
% Returns the state for the next line: the row number incremented by one
% and the updated index.
index_line(Line, {RowNumber, _Index} = Acc) ->
    Tokens = string:tokens(Line, " "),
    {_, Updated} = lists:foldl(fun proccess_word/2, Acc, Tokens),
    {RowNumber + 1, Updated}.
% Fold step for one token of a line.
% The token is normalised with clear_word/1; if need_to_proccess/1 accepts
% the result it is added to the index, otherwise Acc is returned untouched.
proccess_word(Word, Acc) ->
    Cleaned = clear_word(Word),
    case need_to_proccess(Cleaned) of
        false -> Acc;
        true -> index_word(Cleaned, Acc)
    end.
% Normalise a raw token before it is indexed.
% The word is lower-cased and every backslash, single quote, double quote
% and common punctuation mark (. , ; : ! ? parentheses) is removed, so
% that "Dead," and "dead" end up under the same index entry.
% Hyphens are kept, so "battle-field" stays one word.
% Returns the cleaned word as a string (list).
clear_word(Word) ->
    re:replace(string:to_lower(Word),
               "[\\\\'\".,;:!?()]", "", [global, {return, list}]).
% Words that are too common to be worth indexing.
% All entries are lower case and each word is listed exactly once.
common_words() ->
    ["the", "not", "and", "for", "with", "you", "this", "but", "from", "are",
     "any"].
% Decide whether a (cleaned) word belongs in the index.
% Returns true when the word has at least three characters and is not one
% of common_words/0, false otherwise.
need_to_proccess(Word) ->
    LongEnough = length(Word) >= 3,
    Common = lists:member(Word, common_words()),
    LongEnough and not Common.
% Record that Word occurs on row RowNumber.
% A word seen for the first time gets the single range
% [{RowNumber, RowNumber}]; for a known word the stored ranges are
% extended through update_word_positions/2.
% Returns {RowNumber, NewIndex}; the row number is passed through unchanged.
index_word(Word, {RowNumber, Index}) ->
    ThisRow = {RowNumber, RowNumber},
    Extend = fun(Known) -> update_word_positions(Known, ThisRow) end,
    {RowNumber, dict:update(Word, Extend, [ThisRow], Index)}.
% Add the range CurrentPos = {From, To} to a word's list of ranges.
% Positions is a list of {From, To} row ranges; only its last element is
% compared with CurrentPos:
%   * if CurrentPos starts on the row where the last range ends, or on the
%     row right after it, the last range is stretched to CurrentPos' end;
%   * otherwise CurrentPos is appended as a new range.
% An empty Positions list simply yields [CurrentPos].
update_word_positions([], CurrentPos) ->
    [CurrentPos];
update_word_positions(Positions, {CurrFrom, CurrTo} = CurrentPos) ->
    % Separate the last range from everything before it.
    {Earlier, [{LastFrom, LastTo} = Last]} =
        lists:split(length(Positions) - 1, Positions),
    Tail =
        if
            LastTo == CurrFrom - 1;
            LastTo == CurrFrom -> [{LastFrom, CurrTo}];
            true -> [Last, CurrentPos]
        end,
    Earlier ++ Tail.
%%%
%%% Tests
%%%
% "aaa" on rows 1 and 2 (case-insensitively) collapses into the range
% {1, 2}; "bbb" is on row 3 only; "cc" is too short to be indexed.
index_lines_test() ->
    Lines = ["aaa", "AAA", "bbb", "cc"],
    Expected = [{"aaa", [{1, 2}]}, {"bbb", [{3, 3}]}],
    Expected = index:index_lines(Lines).
% Indexing row 1 containing two words stores both under {1, 1} and
% advances the row counter to 2.
index_line_test() ->
    WithAaa = dict:store("aaa", [{1, 1}], dict:new()),
    Expected = dict:store("bbb", [{1, 1}], WithAaa),
    Start = {1, dict:new()},
    {2, Expected} = index:index_line("aaa bbb", Start).
% A word that is not in the index yet gets a single one-row range.
index_word_1_test() ->
    Expected = dict:store("aaa", [{1, 1}], dict:new()),
    Start = {1, dict:new()},
    {_, Expected} = index:index_word("aaa", Start).
% A word already indexed on row 1 and seen again on row 2 has its range
% stretched to {1, 2}.
index_word_2_test() ->
    Before = dict:store("aaa", [{1, 1}], dict:new()),
    After = dict:store("aaa", [{1, 2}], dict:new()),
    {_, After} = index:index_word("aaa", {2, Before}).
% Rows that are not adjacent stay separate ranges.
update_word_pos_1_test() ->
    Result = update_word_positions([{1, 1}], {3, 3}),
    [{1, 1}, {3, 3}] = Result.
% A row directly after the last range extends that range.
update_word_pos_2_test() ->
    Result = update_word_positions([{1, 1}], {2, 2}),
    [{1, 2}] = Result.
% A row equal to the end of the last range leaves the range unchanged.
update_word_pos_3_test() ->
    Result = update_word_positions([{1, 2}], {2, 2}),
    [{1, 2}] = Result.
➥ erl
Erlang/OTP 18 [erts-7.3.1.2] [source] [64-bit] [smp:4:4] [async-threads:10] [kernel-poll:false]
Eshell V7.3.1.2 (abort with ^G)
1> c(index).
{ok,index}
2> eunit:test(index).
All 7 tests passed.
ok
3>
3> index:index_file("gettysburg-address.txt").
[{"above",[{16,16}]},
{"add",[{16,16}]},
{"advanced.",[{20,20}]},
{"ago",[{1,1}]},
{"all",[{3,3}]},
{"altogether",[{10,10}]},
{"battle-field",[{7,7}]},
{"before",[{22,22}]},
{"birth",[{26,26}]},
{"brave",[{15,15}]},
{"brought",[{1,1}]},
{"but,",[{13,13}]},
{"can",[{7,7},{13,14},{18,18}]},
{"cause",[{23,23}]},
{"civil",[{5,5}]},
{"come",[{8,8}]},
{"conceived",[{2,2},{6,6}]},
{"consecrate",[{14,14}]},
{"consecrated",[{16,16}]},
{"continent,",[{2,2}]},
{"created",[{3,3}]},
{"dead",[{22,22},{25,25}]},
{"dead,",[{15,15}]},
{"dedicate",[{8,8},{13,...}]},
{"dedicated",[{3,...},{...}|...]},
{"dedicated,",[{...}]},
{"detract.",[...]},
{[...],...},
{...}|...]
4> q().
ok
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment