Created
March 10, 2017 06:00
-
-
Save stevenproctor/2c49f69927b2a6b6e68c37888746e891 to your computer and use it in GitHub Desktop.
FutureLearn Introduction to Erlang MOOC - WEEK 2: PROGRAMMING CHALLENGE: INDEXING A FILE ASSIGNMENT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Module declaration and public API: the two file helpers plus the
% indexing entry point.
-module(index).
-export([
    get_file_contents/1,
    show_file_contents/1,
    main/1
]).
% Reads a text file into a list of lines and builds a word index from it.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt (long)
% Build the word index for the file called Name.
%
% Reads the file into lines, indexes every line starting at line number 1,
% converts each word's list of line numbers with group_line_numbers/1 and
% returns the result as a list of {Word, Groupings} tuples sorted by Word.
main(Name) ->
    Lines = get_file_contents(Name),
    RawIndex = index_lines(Lines, 1, #{}),
    Grouped = maps:map(
        fun(_Word, LineNumbers) -> group_line_numbers(LineNumbers) end,
        RawIndex
    ),
    lists:keysort(1, maps:to_list(Grouped)).
% Index every line in Lines, numbering them consecutively from LineNumber.
%
% Each line is split with words_in/1 and its words are added to the index
% with index_words/3. Returns the updated index map.
index_lines(Lines, LineNumber, Index) ->
    {_NextLineNumber, FinalIndex} =
        lists:foldl(
            fun(Line, {Number, Acc}) ->
                {Number + 1, index_words(words_in(Line), Number, Acc)}
            end,
            {LineNumber, Index},
            Lines
        ),
    FinalIndex.
% Split Line into its words.
%
% Whitespace (including a stray carriage return left over from CRLF files)
% and common punctuation act as separators, so "nation;" and "nation" end
% up as the same token. Apostrophes and hyphens are deliberately NOT
% separators so that words such as "don't" or "battle-field" stay whole.
% Returns a list of non-empty strings; an empty line yields [].
words_in(Line) ->
    string:tokens(Line, " \t\r\n.,;:!?\"()\\").
% Add every word in Words to Index as occurring on LineNumber.
%
% Words are processed left to right, each one through index_word/3.
% Returns the updated index map.
index_words(Words, LineNumber, Index) ->
    lists:foldl(
        fun(Word, Acc) -> index_word(Word, LineNumber, Acc) end,
        Index,
        Words
    ).
% Record that Word occurs on LineNumber.
%
% The word is normalized first; words rejected by should_index_word/1
% leave the index untouched. Line numbers are kept most-recent-first.
% Because lines are indexed in ascending order, a repeat of the same word
% on the same line always shows up as the head of the stored list, so it
% is enough to compare against the head to avoid storing a line twice.
% Returns the updated index map.
index_word(Word, LineNumber, Index) ->
    Word1 = normalize(Word),
    case should_index_word(Word1) of
        true ->
            maps:update_with(
                Word1,
                fun
                    ([Latest | _] = Lines) when Latest =:= LineNumber ->
                        % Already recorded for this line.
                        Lines;
                    (Lines) ->
                        [LineNumber | Lines]
                end,
                [LineNumber],
                Index
            );
        false ->
            Index
    end.
% A word is indexed only when it is neither short nor common.
% The common-word lookup is skipped for short words.
should_index_word(Word) ->
    not (is_short_word(Word) orelse is_common_word(Word)).
% True when Word (a string, i.e. a list of characters) has fewer than
% three characters.
is_short_word(Word) ->
    length(Word) < 3.
% True when Word is exactly equal to one of the entries of common_words/0.
is_common_word(Word) ->
    lists:any(fun(Common) -> Common =:= Word end, common_words()).
% List of top 100 common words according to
% http://www.duboislc.org/ED-Watch/Words/1-100.html
%
% Every entry is lower case: callers look words up after they have been
% lower-cased, so an upper-case entry (the pronoun "I" in the source
% list) could never match.
common_words() ->
    ["the", "of", "and", "a", "to", "in",
     "is", "you", "that", "it", "he", "was",
     "for", "on", "are", "as", "with", "his",
     "they", "i", "at", "be", "this", "have",
     "from", "or", "one", "had", "by", "word",
     "but", "not", "what", "all", "were", "we",
     "when", "your", "can", "said", "there",
     "use", "an", "each", "which", "she", "do",
     "how", "their", "if", "will", "up", "other",
     "about", "out", "many", "then", "them",
     "these", "so", "some", "her", "would", "make",
     "like", "him", "into", "time", "has", "look",
     "two", "more", "write", "go", "see", "number",
     "no", "way", "could", "people", "my", "than",
     "first", "water", "been", "call", "who",
     "oil", "its", "now", "find", "long", "down",
     "day", "did", "get", "come", "made", "may", "part"].
% Canonical form of Word used as the index key: lower-cased, then passed
% through dumb_lemmatizer/1.
normalize(Word) ->
    dumb_lemmatizer(string:to_lower(Word)).
% Deliberately naive lemmatizer (often wrong for real English): a word
% ending in "ed" or "es" has that ending replaced by "e"; any other word
% is returned unchanged. For example "dedicated" becomes "dedicate", but
% "need" becomes "nee".
dumb_lemmatizer(Word) ->
    % Only one of the two endings can be present, so a single anchored
    % pattern covers both cases.
    re:replace(Word, "e[ds]$", "e", [{return, list}]).
% Group a descending list of line numbers into consecutive runs.
%
% Returns an ascending list of {First, Last} tuples, one per run, e.g.
% [5,3,2,1] gives [{1,3},{5,5}]. An empty list gives [].
group_line_numbers(LineNumbers) ->
    group_line_numbers(LineNumbers, []).

% Walk through a descending list of line numbers
% and group them into consecutive runs.
%
% By walking backwards through the list and building it up
% we end up with an ascending list of line number
% grouping tuples.
group_line_numbers([], Groupings) ->
    Groupings;
group_line_numbers([Line | Lines], [{Start, _End} | _] = Groupings) when Line =:= Start ->
    % Repeated line number (the word occurred more than once on that
    % line): it is already covered by the current run.
    group_line_numbers(Lines, Groupings);
group_line_numbers([Line | Lines], [{Start, End} | Groupings]) when Line =:= Start - 1 ->
    % Adjacent to the current run: extend the run downwards.
    group_line_numbers(Lines, [{Line, End} | Groupings]);
group_line_numbers([Line | Lines], Groupings) ->
    % First line number seen, or a gap: start a new run.
    group_line_numbers(Lines, [{Line, Line} | Groupings]).
% Read the text file called Name and return its lines, in file order,
% as a list of strings.
%
% Crashes with a badmatch if the file cannot be opened. Stripping each
% line's trailing newline and closing the file are left to
% get_all_lines/2, which returns the lines in reverse order.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line and prepends each line, without its trailing
% newline, to Partial. The result is therefore in reverse file order.
% The file is closed once end of file is reached. A read error also
% closes the file and then raises error({read_error, Reason}).
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [strip_trailing_newline(Line) | Partial])
    end.

% Drop a single trailing newline, if there is one. The last line of a
% file may not end in a newline, in which case it is returned unchanged.
strip_trailing_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Reversed] -> lists:reverse(Reversed);
        _ -> Line
    end.
% Print each string in the list on its own line.
% Handy for checking the result of get_file_contents/1. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment