Skip to content

Instantly share code, notes, and snippets.

@stevenproctor
Created March 10, 2017 06:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stevenproctor/2c49f69927b2a6b6e68c37888746e891 to your computer and use it in GitHub Desktop.
Save stevenproctor/2c49f69927b2a6b6e68c37888746e891 to your computer and use it in GitHub Desktop.
FutureLearn Introduction to Erlang MOOC - WEEK 2: PROGRAMMING CHALLENGE: INDEXING A FILE ASSIGNMENT
-module(index).
-export([get_file_contents/1,
show_file_contents/1,
main/1]).
% Builds an index of the words in a text file: for each indexed word,
% the ranges of line numbers on which it occurs.
% Example files to try it on:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Build the word index for the file called Name.
% Lines are numbered starting from 1.
% Returns a list of {Word, Ranges} tuples sorted by Word, where Ranges is
% whatever group_line_numbers/1 produces for that word's line numbers.
main(Name) ->
    Lines = get_file_contents(Name),
    RawIndex = index_lines(Lines, 1, maps:new()),
    Grouped = maps:map(fun(_Word, LineNumbers) ->
                               group_line_numbers(LineNumbers)
                       end,
                       RawIndex),
    lists:keysort(1, maps:to_list(Grouped)).
% Index every line of the list, numbering the lines consecutively
% starting at LineNumber. Returns Index extended with the words found.
index_lines(Lines, LineNumber, Index) ->
    {_NextLineNumber, Filled} =
        lists:foldl(fun(Line, {Number, Acc}) ->
                            {Number + 1, index_words(words_in(Line), Number, Acc)}
                    end,
                    {LineNumber, Index},
                    Lines),
    Filled.
% Split a line into its words.
% Whitespace and common punctuation act as separators, so that "earth?" and
% "earth," both yield the word "earth". Apostrophes and hyphens are kept,
% so "don't" and "counting-house" stay whole.
% Returns a list of non-empty strings (possibly empty).
words_in(Line) ->
    string:tokens(Line, " \t\r\n.,;:!?\"()[]\\").
% Record each word of Words in Index as occurring on line LineNumber.
% Returns the updated index.
index_words(Words, LineNumber, Index) ->
    lists:foldl(fun(Word, Acc) -> index_word(Word, LineNumber, Acc) end,
                Index,
                Words).
% Record a single word in Index as occurring on line LineNumber.
% The word is normalized first; words rejected by should_index_word/1
% leave Index unchanged.
% Line numbers are kept most-recent-first. A word that occurs several times
% on the same line is recorded only once for that line.
index_word(Word, LineNumber, Index) ->
    Word1 = normalize(Word),
    case should_index_word(Word1) of
        true ->
            maps:update_with(Word1,
                             % The head of the list is the latest line seen for
                             % this word; skip it if it is the current line.
                             fun([Latest | _] = Lines) when Latest =:= LineNumber ->
                                     Lines;
                                (Lines) ->
                                     [LineNumber | Lines]
                             end,
                             [LineNumber],
                             Index);
        false ->
            Index
    end.
% A word is indexed unless it is short or is one of the common words.
should_index_word(Word) ->
    not (is_short_word(Word) orelse is_common_word(Word)).
% True when Word has fewer than three characters.
is_short_word(Word) ->
    length(Word) < 3.
% True when Word is exactly one of the entries of common_words/0.
is_common_word(Word) ->
    CommonWords = common_words(),
    lists:member(Word, CommonWords).
% List of top 100 common words according to
% http://www.duboislc.org/ED-Watch/Words/1-100.html
% All entries are lower case, because words are lower-cased by normalize/1
% before they are compared against this list.
common_words() ->
    ["the", "of", "and", "a", "to", "in",
     "is", "you", "that", "it", "he", "was",
     "for", "on", "are", "as", "with", "his",
     "they", "i", "at", "be", "this", "have",
     "from", "or", "one", "had", "by", "word",
     "but", "not", "what", "all", "were", "we",
     "when", "your", "can", "said", "there",
     "use", "an", "each", "which", "she", "do",
     "how", "their", "if", "will", "up", "other",
     "about", "out", "many", "then", "them",
     "these", "so", "some", "her", "would", "make",
     "like", "him", "into", "time", "has", "look",
     "two", "more", "write", "go", "see", "number",
     "no", "way", "could", "people", "my", "than",
     "first", "water", "been", "call", "who",
     "oil", "its", "now", "find", "long", "down",
     "day", "did", "get", "come", "made", "may", "part"].
% Normalize a word for indexing: lower-case it, then run it through
% dumb_lemmatizer/1.
normalize(Word) ->
    dumb_lemmatizer(string:to_lower(Word)).
% A deliberately naive lemmatizer (often wrong for real English):
% a trailing "ed" is replaced by "e", and then a trailing "es" is
% replaced by "e". Returns the result as a string.
dumb_lemmatizer(Word) ->
    SuffixPatterns = ["ed$", "es$"],
    lists:foldl(fun(Pattern, Stem) ->
                        re:replace(Stem, Pattern, "e", [{return, list}])
                end,
                Word,
                SuffixPatterns).
% Group a descending list of line numbers into consecutive runs.
% Returns an ascending list of {FirstLine, LastLine} tuples, e.g.
% [5,4,2] becomes [{2,2},{4,5}]. An empty list gives [].
% A line number that appears more than once in a row is counted once.
group_line_numbers(LineNumbers) ->
    group_line_numbers(LineNumbers, []).

% Walk through the descending list of line numbers, extending the most
% recent run downwards or starting a new one.
%
% Because the input is walked from the highest line number to the lowest
% and each run is pushed on the front of the accumulator, the result
% comes out as an ascending list of line number grouping tuples.
group_line_numbers([], Groupings) ->
    Groupings;
group_line_numbers([Line|Lines], [{Line, _End}|_] = Groupings) ->
    % Same line number as the start of the current run: a repeat, skip it.
    group_line_numbers(Lines, Groupings);
group_line_numbers([Line|Lines], [{Start, End}|Groupings]) when Line =:= Start - 1 ->
    % Directly precedes the current run: extend the run downwards.
    group_line_numbers(Lines, [{Line, End}|Groupings]);
group_line_numbers([Line|Lines], Groupings) ->
    % First line number seen, or a gap: start a new single-line run.
    group_line_numbers(Lines, [{Line, Line}|Groupings]).
% Get the contents of a text file into a list of lines.
% Each line has its line terminator ("\n" or "\r\n") removed; a final line
% that has no terminator is returned unchanged.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, File} = file:open(Name, [read]),
    try
        lists:reverse(get_all_lines(File, []))
    after
        % Close the file even when reading fails part-way through.
        file:close(File)
    end.

% Auxiliary function for get_file_contents.
% Not exported.
% Reads lines until eof, accumulating them most-recent-first in Partial.
% The caller opens and closes File. Anything other than eof or a line
% (i.e. a read error) crashes with a case_clause.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            Partial;
        Line when is_list(Line) ->
            get_all_lines(File, [strip_line_ending(Line) | Partial])
    end.

% Remove one trailing "\n" or "\r\n" from Line, if present.
strip_line_ending(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        _NoTerminator -> Line
    end.
% Print each string of the list on its own line and return ok.
% Handy for checking the result of get_file_contents/1.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment