stevenproctor/index.erl

## index.erl
-module(index).
-export([get_file_contents/1,
         show_file_contents/1,
         main/1]).


% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)

% Build a word index for the text file called Name.
%
% Returns a list of {Word, Ranges} tuples ordered by Word, where Ranges
% is the result of group_line_numbers/1 applied to the line numbers
% collected for that word. Line numbering starts at 1.
main(Name) ->
    Lines = get_file_contents(Name),
    RawIndex = index_lines(Lines, 1, maps:new()),
    % Collapse each word's collected line numbers into ranges.
    Grouped = maps:map(fun(_Word, LineNumbers) -> group_line_numbers(LineNumbers) end,
                       RawIndex),
    lists:keysort(1, maps:to_list(Grouped)).


% Fold every line into Index, numbering the lines consecutively
% starting from LineNumber. Returns the updated index.
index_lines(Lines, LineNumber, Index) ->
    Step = fun(Line, {Number, Acc}) ->
                   {Number + 1, index_words(words_in(Line), Number, Acc)}
           end,
    {_NextNumber, Indexed} = lists:foldl(Step, {LineNumber, Index}, Lines),
    Indexed.

% Split Line into a list of words.
%
% Tokens are separated by whitespace (space, tab, carriage return,
% newline), sentence punctuation (. , ; : ! ?), double quotes,
% parentheses and backslashes. Carriage return is included so that
% files with CRLF line endings do not leave "\r" glued to the last
% word of each line. Apostrophes and hyphens are deliberately kept so
% that words such as don't and well-known stay in one piece.
% Empty tokens are never returned.
words_in(Line) ->
    string:tokens(Line, " \t\r\n.,;:!?\"()\\").


% Record every word of one line in Index under LineNumber.
% Returns the updated index.
index_words(Words, LineNumber, Index) ->
    lists:foldl(fun(Word, Acc) -> index_word(Word, LineNumber, Acc) end,
                Index,
                Words).

% Add LineNumber to the entry for Word in Index.
%
% The word is normalized first; words rejected by should_index_word/1
% leave the index unchanged. Line numbers are consed onto the front of
% the entry, so the most recently added line is always at the head.
index_word(Word, LineNumber, Index) ->
    Key = normalize(Word),
    % Only cons LineNumber when it is not already at the head, so a word
    % repeated on one line is recorded once instead of producing
    % duplicate entries for that line.
    AddLine = fun([Latest | _] = Lines) when Latest =:= LineNumber -> Lines;
                 (Lines) -> [LineNumber | Lines]
              end,
    case should_index_word(Key) of
        true -> maps:update_with(Key, AddLine, [LineNumber], Index);
        false -> Index
    end.


% A word is indexed unless it is short or common. The common-word
% lookup is skipped for short words because orelse short-circuits.
should_index_word(Word) ->
    not (is_short_word(Word) orelse is_common_word(Word)).

% True when Word (a list of characters) has fewer than three elements.
is_short_word(Word) ->
    length(Word) < 3.

% True when Word is exactly equal to one of the entries returned by
% common_words/0.
is_common_word(Word) ->
    lists:any(fun(Common) -> Common =:= Word end, common_words()).

% The 100 most common English words, in the order given by
% http://www.duboislc.org/ED-Watch/Words/1-100.html
% Laid out ten per row; entries are spelled exactly as in that list.
common_words() ->
    ["the", "of", "and", "a", "to", "in", "is", "you", "that", "it",
     "he", "was", "for", "on", "are", "as", "with", "his", "they", "I",
     "at", "be", "this", "have", "from", "or", "one", "had", "by", "word",
     "but", "not", "what", "all", "were", "we", "when", "your", "can", "said",
     "there", "use", "an", "each", "which", "she", "do", "how", "their", "if",
     "will", "up", "other", "about", "out", "many", "then", "them", "these", "so",
     "some", "her", "would", "make", "like", "him", "into", "time", "has", "look",
     "two", "more", "write", "go", "see", "number", "no", "way", "could", "people",
     "my", "than", "first", "water", "been", "call", "who", "oil", "its", "now",
     "find", "long", "down", "day", "did", "get", "come", "made", "may", "part"].


% Lower-case Word and pass the result through dumb_lemmatizer/1.
normalize(Word) ->
    dumb_lemmatizer(string:to_lower(Word)).

% Extremely naive suffix folding (frequently wrong for real English):
% a trailing "ed" is rewritten to "e", and then a trailing "es" is
% rewritten to "e". The result is returned as a list.
dumb_lemmatizer(Word) ->
    lists:foldl(fun(Suffix, Acc) -> re:replace(Acc, Suffix, "e", [{return, list}]) end,
                Word,
                ["ed$", "es$"]).


% Collapse a descending list of line numbers into an ascending list of
% {First, Last} tuples, one per run of consecutive line numbers.
% For example [5,3,2,1] becomes [{1,3},{5,5}].
group_line_numbers(LineNumbers) ->
    group_line_numbers(LineNumbers, []).

% Walk through the descending list of line numbers, extending or
% opening a run at the head of Groupings.
%
% Because the input is walked from the highest line number down and
% each run is pushed onto the front, the result comes out in ascending
% order without a final reverse.
group_line_numbers([], Groupings) ->
    Groupings;
group_line_numbers([Line | Lines], [{Start, _End} | _] = Groupings) when Line =:= Start ->
    % Repeated line number: it is already covered by the current run,
    % so do not open a duplicate {Line, Line} group.
    group_line_numbers(Lines, Groupings);
group_line_numbers([Line | Lines], [{Start, End} | Groupings]) when Line =:= Start - 1 ->
    % Line is directly below the current run: extend the run downwards.
    group_line_numbers(Lines, [{Line, End} | Groupings]);
group_line_numbers([Line | Lines], Groupings) ->
    % First line seen, or a gap: start a new single-line run.
    group_line_numbers(Lines, [{Line, Line} | Groupings]).

% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.

% Open the file called Name for reading and return its lines in file
% order. Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    % get_all_lines/2 accumulates by consing, so its result is last-first.
    lists:reverse(get_all_lines(Device, [])).

% Auxiliary function for get_file_contents.
% Not exported.

% Read the remaining lines from File, consing each one onto Partial.
%
% Returns the accumulated lines in reverse order (last line first) and
% closes File once eof is reached. A trailing newline is removed from
% each line; a final line that does not end in a newline is kept
% intact rather than losing its last character.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            Stripped = case lists:last(Line) of
                           $\n -> lists:droplast(Line);
                           _ -> Line
                       end,
            get_all_lines(File, [Stripped | Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.

% Print each string of the list on its own line. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
	-module(index).
	-export([get_file_contents/1,
	show_file_contents/1,
	main/1]).


	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)

	% Build a word index for the text file called Name.
	%
	% Returns a list of {Word, Ranges} tuples ordered by Word, where Ranges
	% is the result of group_line_numbers/1 applied to the line numbers
	% collected for that word. Line numbering starts at 1.
	main(Name) ->
	    Lines = get_file_contents(Name),
	    RawIndex = index_lines(Lines, 1, maps:new()),
	    % Collapse each word's collected line numbers into ranges.
	    Grouped = maps:map(fun(_Word, LineNumbers) -> group_line_numbers(LineNumbers) end,
	                       RawIndex),
	    lists:keysort(1, maps:to_list(Grouped)).


	% Fold every line into Index, numbering the lines consecutively
	% starting from LineNumber. Returns the updated index.
	% (Replaces the clause whose list pattern was written with an escaped
	% "\|", which is not valid Erlang.)
	index_lines(Lines, LineNumber, Index) ->
	    Step = fun(Line, {Number, Acc}) ->
	                   {Number + 1, index_words(words_in(Line), Number, Acc)}
	           end,
	    {_NextNumber, Indexed} = lists:foldl(Step, {LineNumber, Index}, Lines),
	    Indexed.

	% Split Line into a list of words.
	%
	% Tokens are separated by whitespace (space, tab, carriage return,
	% newline), sentence punctuation (. , ; : ! ?), double quotes,
	% parentheses and backslashes. Carriage return is included so that
	% files with CRLF line endings do not leave "\r" glued to the last
	% word of each line. Apostrophes and hyphens are deliberately kept so
	% that words such as don't and well-known stay in one piece.
	% Empty tokens are never returned.
	words_in(Line) ->
	    string:tokens(Line, " \t\r\n.,;:!?\"()\\").


	% Record every word of one line in Index under LineNumber.
	% Returns the updated index.
	% (Replaces the clause whose list pattern was written with an escaped
	% "\|", which is not valid Erlang.)
	index_words(Words, LineNumber, Index) ->
	    lists:foldl(fun(Word, Acc) -> index_word(Word, LineNumber, Acc) end,
	                Index,
	                Words).

	% Add LineNumber to the entry for Word in Index.
	%
	% The word is normalized first; words rejected by should_index_word/1
	% leave the index unchanged. Line numbers are consed onto the front of
	% the entry, so the most recently added line is always at the head.
	index_word(Word, LineNumber, Index) ->
	    Key = normalize(Word),
	    % Only cons LineNumber when it is not already at the head, so a word
	    % repeated on one line is recorded once instead of producing
	    % duplicate entries for that line.
	    AddLine = fun([Latest | _] = Lines) when Latest =:= LineNumber -> Lines;
	                 (Lines) -> [LineNumber | Lines]
	              end,
	    case should_index_word(Key) of
	        true -> maps:update_with(Key, AddLine, [LineNumber], Index);
	        false -> Index
	    end.


	should_index_word(Word) ->
	(not is_short_word(Word)) andalso (not is_common_word(Word)).

	% True when Word (a list of characters) has fewer than three elements.
	is_short_word(Word) ->
	    length(Word) < 3.

	% True when Word is exactly equal to one of the entries returned by
	% common_words/0.
	is_common_word(Word) ->
	    lists:any(fun(Common) -> Common =:= Word end, common_words()).

	% The 100 most common English words, in the order given by
	% http://www.duboislc.org/ED-Watch/Words/1-100.html
	% Laid out ten per row; entries are spelled exactly as in that list.
	common_words() ->
	    ["the", "of", "and", "a", "to", "in", "is", "you", "that", "it",
	     "he", "was", "for", "on", "are", "as", "with", "his", "they", "I",
	     "at", "be", "this", "have", "from", "or", "one", "had", "by", "word",
	     "but", "not", "what", "all", "were", "we", "when", "your", "can", "said",
	     "there", "use", "an", "each", "which", "she", "do", "how", "their", "if",
	     "will", "up", "other", "about", "out", "many", "then", "them", "these", "so",
	     "some", "her", "would", "make", "like", "him", "into", "time", "has", "look",
	     "two", "more", "write", "go", "see", "number", "no", "way", "could", "people",
	     "my", "than", "first", "water", "been", "call", "who", "oil", "its", "now",
	     "find", "long", "down", "day", "did", "get", "come", "made", "may", "part"].


	% Lower-case Word and pass the result through dumb_lemmatizer/1.
	normalize(Word) ->
	    dumb_lemmatizer(string:to_lower(Word)).

	% Extremely naive suffix folding (frequently wrong for real English):
	% a trailing "ed" is rewritten to "e", and then a trailing "es" is
	% rewritten to "e". The result is returned as a list.
	dumb_lemmatizer(Word) ->
	    lists:foldl(fun(Suffix, Acc) -> re:replace(Acc, Suffix, "e", [{return, list}]) end,
	                Word,
	                ["ed$", "es$"]).



	% Collapse a descending list of line numbers into an ascending list of
	% {First, Last} tuples, one per run of consecutive line numbers.
	% For example [5,3,2,1] becomes [{1,3},{5,5}].
	group_line_numbers(LineNumbers) ->
	    group_line_numbers(LineNumbers, []).

	% Walk through the descending list of line numbers, extending or
	% opening a run at the head of Groupings.
	%
	% Because the input is walked from the highest line number down and
	% each run is pushed onto the front, the result comes out in ascending
	% order without a final reverse.
	group_line_numbers([], Groupings) ->
	    Groupings;
	group_line_numbers([Line | Lines], [{Start, _End} | _] = Groupings) when Line =:= Start ->
	    % Repeated line number: it is already covered by the current run,
	    % so do not open a duplicate {Line, Line} group.
	    group_line_numbers(Lines, Groupings);
	group_line_numbers([Line | Lines], [{Start, End} | Groupings]) when Line =:= Start - 1 ->
	    % Line is directly below the current run: extend the run downwards.
	    group_line_numbers(Lines, [{Line, End} | Groupings]);
	group_line_numbers([Line | Lines], Groupings) ->
	    % First line seen, or a gap: start a new single-line run.
	    group_line_numbers(Lines, [{Line, Line} | Groupings]).

	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	% Open the file called Name for reading and return its lines in file
	% order. Crashes with a badmatch if the file cannot be opened.
	get_file_contents(Name) ->
	    {ok, Device} = file:open(Name, [read]),
	    % get_all_lines/2 accumulates by consing, so its result is last-first.
	    lists:reverse(get_all_lines(Device, [])).

	% Auxiliary function for get_file_contents.
	% Not exported.

	% Read the remaining lines from File, consing each one onto Partial.
	%
	% Returns the accumulated lines in reverse order (last line first) and
	% closes File once eof is reached. A trailing newline is removed from
	% each line; a final line that does not end in a newline is kept
	% intact rather than losing its last character.
	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            Stripped = case lists:last(Line) of
	                           $\n -> lists:droplast(Line);
	                           _ -> Line
	                       end,
	            get_all_lines(File, [Stripped | Partial])
	    end.

	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.

	% Print each string of the list on its own line. Returns ok.
	% (Replaces the clause whose list pattern was written with an escaped
	% "\|", which is not valid Erlang.)
	show_file_contents(Lines) ->
	    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).