Skip to content

Instantly share code, notes, and snippets.

@yuanjs
Last active May 22, 2020 06:23
Show Gist options
  • Save yuanjs/047dc205a29094fb39faf25a483221f1 to your computer and use it in GitHub Desktop.
Save yuanjs/047dc205a29094fb39faf25a483221f1 to your computer and use it in GitHub Desktop.
erlang indexing a file
-module(index).
-export([get_file_contents/1,show_file_contents/1,
get_string_words_index/4,get_all_lines_index/1,build_range/1,
build_words_occurs/1,sort_words_index/1, get_index/1]).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Read the text file Name and return its lines, in file order, as a list of
% strings. Line-terminator stripping and closing the file are both done by
% get_all_lines/2, which hands the lines back newest-first.
% Crashes with badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line until eof, pushing each line onto Partial, so the
% result is in reverse file order. The file is closed before returning.
%
% Only an actual line terminator ("\n" or "\r\n") is removed from a line, so
% a final line that does not end in a newline keeps its last character.
% If the read fails, the file is closed and error({read_error, Reason}) is
% raised.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            % Close explicitly so the descriptor is not left open on failure.
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [string:chomp(Line) | Partial])
    end.
% Print each string of the given list on its own line on standard output.
% Handy for checking the result of get_file_contents/1. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
% My functions
% 1. Call get_file_contents to read the file into a list of lines.
% 2. Call get_all_lines_index to build a list of {Word, LineNumber} pairs, one per word occurrence.
% 3. Call sort_words_index to sort that list by word and then by line number.
% 4. Finally call build_words_occurs to collapse each word's line numbers into a list of {From, To} range tuples.
% Build the word index of the text file Filename.
% The file is read into lines, every word occurrence is collected as a
% {Word, LineNumber} pair, the pairs are sorted, and build_words_occurs/1
% folds them into {Word, Ranges} entries. That function returns its entries
% last-word-first, hence the final reverse.
get_index(Filename) ->
    Lines = get_file_contents(Filename),
    Sorted = sort_words_index(get_all_lines_index(Lines)),
    LastWordFirst = build_words_occurs(Sorted),
    lists:reverse(LastWordFirst).
% Build the word index, with each word's line numbers collapsed into ranges.
%
% Xs is a list of {Word, LineNumber} pairs in which all pairs of the same
% word are adjacent (sort_words_index/1 produces such a list).
% Returns a list of {Word, Ranges} tuples, where Ranges is what build_range/1
% makes of that word's line numbers. Entries come out in the reverse of the
% input order (the last word first); get_index/1 reverses them.
%
% An empty input yields an empty index. The traversal is seeded from the
% first pair instead of a placeholder word, so no entry is ever emitted for
% a word that was not in the input, and "" is treated like any other word.
build_words_occurs([]) ->
    [];
build_words_occurs([{Word, LineNumber} | Rest]) ->
    build_words_occurs(Rest, [], {Word, LineNumber}, [LineNumber]).

% Arguments: remaining pairs, finished entries, {CurrentWord, LastLineSeen},
% and the current word's line numbers collected newest-first.
build_words_occurs([], Results, {Word, _LastLine}, Lines) ->
    [{Word, build_range(Lines)} | Results];
build_words_occurs([{Word, LastLine} | Rest], Results, {Word, LastLine} = Current, Lines) ->
    % Same word on the same line again: already recorded, skip it.
    build_words_occurs(Rest, Results, Current, Lines);
build_words_occurs([{Word, LineNumber} | Rest], Results, {Word, _LastLine}, Lines) ->
    % Same word on a different line: remember the additional line.
    build_words_occurs(Rest, Results, {Word, LineNumber}, [LineNumber | Lines]);
build_words_occurs([{NewWord, LineNumber} | Rest], Results, {Word, _LastLine}, Lines) ->
    % A different word starts: close the entry of the previous one.
    Finished = {Word, build_range(Lines)},
    build_words_occurs(Rest, [Finished | Results], {NewWord, LineNumber}, [LineNumber]).
% Turn a list of line numbers into a list of ranges: the list is put into
% the opposite order and then handed to group_by_range/1.
build_range(Xs) ->
    Reversed = lists:reverse(Xs),
    group_by_range(Reversed).
% This was the trickiest part for me; I had to look up a solution on the internet.
% Group a list of numbers into {From, To} ranges, in list order.
% A number that is exactly one more than the end of the range being built
% extends that range; any other number closes it and opens a new
% single-element range. An empty list gives an empty list.
group_by_range([]) ->
    [];
group_by_range([First | Rest]) ->
    {Open, Closed} = lists:foldl(fun extend_or_close/2, {{First, First}, []}, Rest),
    lists:reverse([Open | Closed]).

% Fold step: the accumulator is {RangeBeingBuilt, ClosedRangesNewestFirst}.
extend_or_close(N, {{From, To}, Closed}) when To + 1 == N ->
    {{From, N}, Closed};
extend_or_close(N, {Range, Closed}) ->
    {{N, N}, [Range | Closed]}.
% Sort {Word, LineNumber} pairs by word and, for equal words, by line number.
%
% Tuples of the same size compare element by element in Erlang's standard
% term order, so the default sort gives exactly this ordering. A comparator
% of the form "Word1 < Word2 orelse Line1 < Line2" is not a valid ordering:
% it ranks {"b", 1} before {"a", 2}, which leaves occurrences of one word
% scattered through the result.
sort_words_index(Xs) ->
    lists:sort(Xs).
% Collect the {Word, LineNumber} pairs of every line in Xs.
% Lines are numbered from 1 in list order. The pairs of each line are put in
% front of those gathered so far, so pairs of later lines come first in the
% result.
get_all_lines_index(Xs) ->
    {WordsIndex, _NextLineNumber} =
        lists:foldl(
            fun(Line, {Collected, LineNumber}) ->
                FromLine = get_string_words_index(Line, [], "", LineNumber),
                {FromLine ++ Collected, LineNumber + 1}
            end,
            {[], 1},
            Xs
        ),
    WordsIndex.
% Split one line of text into {Word, LineNumber} pairs.
%
% Arguments:
%   1. the characters of the line still to be scanned;
%   2. WordsIndex  - pairs found so far; new pairs are put in front of it;
%   3. CurrentWord - letters of the word being read, in reverse order
%                    (pass "" to start);
%   4. LineNumber  - the number stored with every word of this line.
%
% A word is a run of ASCII letters; upper-case letters are folded to lower
% case. Any other character ends the word being read. Only words of more
% than three letters are recorded.
get_string_words_index([], WordsIndex, CurrentWord, LineNumber) ->
    % End of the line also ends the word being read, if there is one.
    record_word(CurrentWord, LineNumber, WordsIndex);
get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $A, X =< $Z ->
    % $a - $A is the distance between the upper- and lower-case ASCII letters.
    get_string_words_index(Xs, WordsIndex, [X + ($a - $A) | CurrentWord], LineNumber);
get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $a, X =< $z ->
    get_string_words_index(Xs, WordsIndex, [X | CurrentWord], LineNumber);
get_string_words_index([_Separator | Xs], WordsIndex, CurrentWord, LineNumber) ->
    get_string_words_index(Xs, record_word(CurrentWord, LineNumber, WordsIndex), "", LineNumber).

% Put the finished word in front of WordsIndex when it has more than three
% letters; shorter words (including the empty one) are dropped. The word
% arrives reversed and is stored in reading order.
record_word([_, _, _, _ | _] = Reversed, LineNumber, WordsIndex) ->
    [{lists:reverse(Reversed), LineNumber} | WordsIndex];
record_word(_TooShort, _LineNumber, WordsIndex) ->
    WordsIndex.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment