kylethebaker/assignment2.erl

## assignment2.erl
-module(assignment2).

-export([print_indexes/1]).
-export([get_indexes/1, get_top_n_words/2, get_alpha_indexes/1]).

-define(SPLIT_TOKENS, " .,'\"()[]{}!?-\\").

% Builds the complete word index for Filename and returns it as a list of
% {Word, Ranges} tuples, where Ranges is the range-tuple form produced by
% get_indexes_ranges/1.
get_indexes(Filename) ->
  RangeMap = get_indexes_ranges(Filename),
  maps:to_list(RangeMap).

% Gets the top N words from the file, ranked by the number of distinct lines
% each word appears on (most lines first). The result is a list of
% {Word, LineCount} tuples. When the file contains fewer than N distinct
% words, every word is returned instead of crashing.
get_top_n_words(Filename, N) ->
  Indexes = get_indexes_list(Filename),
  Sorted = sort_indexes_count(Indexes),
  % lists:sublist/2 tolerates N > length(Sorted); lists:split/2 raises badarg.
  TopN = lists:sublist(Sorted, N),
  [{Word, length(Lines)} || {Word, Lines} <- TopN].

% Returns the word index of Filename as a list of {Word, LineNumbers} tuples
% ordered alphabetically by word.
get_alpha_indexes(Filename) ->
  sort_indexes_alpha(get_indexes_list(Filename)).

% Builds the word index of Filename as a map from each word to the lines it
% occurs on, with consecutive line numbers collapsed into {First, Last}
% tuples: e.g. [{1, 3}, {6, 6}, {8, 9}]
get_indexes_ranges(Filename) ->
  convert_to_ranges(get_indexes_list(Filename)).

% Builds the word index of Filename as a map from each word to the sorted
% list of line numbers it occurs on: e.g. [1, 2, 3, 6, 8, 9]
% Lines are numbered starting at 1.
get_indexes_list(Filename) ->
  Initial = {1, maps:new()},
  % The fold threads {NextLineNumber, WordMap} through every line of the file.
  {_NextLine, WordMap} =
    lists:foldl(fun get_line_indexes/2, Initial, get_file_contents(Filename)),
  WordMap.

% Turns the Indexes map into a list of {Word, Lines} tuples ordered by Word.
sort_indexes_alpha(Indexes) ->
  lists:keysort(1, maps:to_list(Indexes)).

% Turns the Indexes map into a list of {Word, Lines} tuples ordered by how
% many lines each word occurs on, most lines first. Lines must be a plain
% list of line numbers (not range tuples), since its length is the sort key.
sort_indexes_count(Indexes) ->
  Tagged = [{Word, Lines, length(Lines)} || {Word, Lines} <- maps:to_list(Indexes)],
  Ascending = lists:keysort(3, Tagged),
  % Drop the temporary count, then flip to descending order.
  lists:reverse([{Word, Lines} || {Word, Lines, _Count} <- Ascending]).

% Fold step for building the index: records every word of Line as occurring
% on line number N. The accumulator is {N, Map}, where Map has each word as
% key and an ordered set of line numbers as value. Returns the accumulator
% for the following line, {N + 1, UpdatedMap}.
get_line_indexes(Line, {N, Map}) ->
  {N + 1, update_word_indexes(get_words(Line), N, Map)}.

% Splits Line into words on the separator characters in ?SPLIT_TOKENS
% (space and common punctuation) and lowercases each word. Characters that
% are not listed in ?SPLIT_TOKENS stay part of the word; empty words are
% never produced.
get_words(Line) ->
  % string:lexemes/2 is the supported replacement for the obsolete
  % string:tokens/2.
  [string:lowercase(Word) || Word <- string:lexemes(Line, ?SPLIT_TOKENS)].

% Records in Map that every word in Words occurs on line N and returns the
% updated map. Line numbers are kept in an ordered set per word, so they stay
% sorted and a word repeated on the same line is only recorded once.
update_word_indexes(Words, N, Map) ->
  AddLine = fun(Word, Acc) ->
    Lines = ordsets:add_element(N, maps:get(Word, Acc, ordsets:new())),
    Acc#{Word => Lines}
  end,
  lists:foldl(AddLine, Map, Words).

% Rewrites every value of the Indexes map from a list of line numbers into
% the equivalent list of {First, Last} ranges. Keys are left untouched.
convert_to_ranges(Indexes) ->
  maps:map(fun(_Word, Lines) -> list_to_ranges(Lines) end, Indexes).

% Collapses a sorted, duplicate-free list of numbers into a list of
% {First, Last} tuples, one per run of consecutive numbers.
% Example: [1, 2, 3, 6, 9, 10, 12] becomes [{1, 3}, {6, 6}, {9, 10}, {12, 12}]
list_to_ranges(List) ->
  list_to_ranges(List, []).

% Acc collects the ranges found so far, newest first.
list_to_ranges([], Acc) ->
  lists:reverse(Acc);
list_to_ranges(Remaining, Acc) ->
  {Range, Rest} = next_range_sequence(Remaining),
  list_to_ranges(Rest, [Range | Acc]).

% Peels the leading run of consecutive numbers off a non-empty list and
% returns {{First, Last}, Remaining}. The list must be sorted and free of
% duplicates.
% Example: [1, 2, 3, 5, 6, 8] returns {{1, 3}, [5, 6, 8]}
next_range_sequence([First | Rest]) ->
  next_range_sequence(Rest, {First, First}).

% Grow the current range while the next number directly follows its end.
next_range_sequence([Next | Rest], {Start, End}) when Next == End + 1 ->
  next_range_sequence(Rest, {Start, Next});
next_range_sequence(Remaining, Found) ->
  {Found, Remaining}.

% Pretty prints every entry of a word index, one word per line. Accepts
% either a list of {Word, Indexes} tuples or a map; anything that is not a
% list is converted with maps:to_list/1 first. Returns ok.
print_indexes(Indexes) when is_list(Indexes) ->
  lists:foreach(fun(Entry) -> print_word_indexes(Entry) end, Indexes);
print_indexes(Indexes) ->
  print_indexes(maps:to_list(Indexes)).

% Pretty prints one index entry as "Word -> Indexes" followed by a newline.
% Returns ok.
print_word_indexes({Word, Indexes}) ->
  LineList = ordsets:to_list(Indexes),
  ok = io:format("~s -> ", [Word]),
  io:format("~w~n", [LineList]).

% Opens the file Name for reading and returns its lines in file order.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
  {ok, Device} = file:open(Name, [read]),
  % get_all_lines/1 accumulates lines last-first, so flip them back.
  ReversedLines = get_all_lines(Device),
  lists:reverse(ReversedLines).

% Reads every remaining line from the open IO device File and closes it.
% Returns the lines in REVERSE order (last line first), each without its
% trailing newline. Raises error({read_failed, Reason}) if reading fails;
% the device is closed in that case too.
get_all_lines(File) -> get_all_lines(File, []).
get_all_lines(File, Lines) ->
  case io:get_line(File, "") of
    eof ->
      file:close(File),
      Lines;
    {error, Reason} ->
      file:close(File),
      error({read_failed, Reason});
    Line ->
      get_all_lines(File, [strip_trailing_newline(Line) | Lines])
  end.

% Removes a single trailing "\n" if present. The last line of a file may not
% end in a newline, so blindly dropping the last character would eat data.
strip_trailing_newline(Line) ->
  case lists:suffix("\n", Line) of
    true -> lists:droplast(Line);
    false -> Line
  end.
	-module(assignment2).

	-export([print_indexes/1]).
	-export([get_indexes/1, get_top_n_words/2, get_alpha_indexes/1]).

	-define(SPLIT_TOKENS, " .,'\"()[]{}!?-\\").

	% Builds the complete word index for Filename and returns it as a list of
	% {Word, Ranges} tuples, where Ranges is the range-tuple form produced by
	% get_indexes_ranges/1.
	get_indexes(Filename) ->
	  RangeMap = get_indexes_ranges(Filename),
	  maps:to_list(RangeMap).

	% Gets the top N words from the file, ranked by the number of distinct lines
	% each word appears on (most lines first). The result is a list of
	% {Word, LineCount} tuples. When the file contains fewer than N distinct
	% words, every word is returned instead of crashing.
	get_top_n_words(Filename, N) ->
	  Indexes = get_indexes_list(Filename),
	  Sorted = sort_indexes_count(Indexes),
	  % lists:sublist/2 tolerates N > length(Sorted); lists:split/2 raises badarg.
	  TopN = lists:sublist(Sorted, N),
	  [{Word, length(Lines)} || {Word, Lines} <- TopN].

	% Returns the word index of Filename as a list of {Word, LineNumbers} tuples
	% ordered alphabetically by word.
	get_alpha_indexes(Filename) ->
	  sort_indexes_alpha(get_indexes_list(Filename)).

	% Builds the word index of Filename as a map from each word to the lines it
	% occurs on, with consecutive line numbers collapsed into {First, Last}
	% tuples: e.g. [{1, 3}, {6, 6}, {8, 9}]
	get_indexes_ranges(Filename) ->
	  convert_to_ranges(get_indexes_list(Filename)).

	% Builds the word index of Filename as a map from each word to the sorted
	% list of line numbers it occurs on: e.g. [1, 2, 3, 6, 8, 9]
	% Lines are numbered starting at 1.
	get_indexes_list(Filename) ->
	  Initial = {1, maps:new()},
	  % The fold threads {NextLineNumber, WordMap} through every line of the file.
	  {_NextLine, WordMap} =
	    lists:foldl(fun get_line_indexes/2, Initial, get_file_contents(Filename)),
	  WordMap.

	% Turns the Indexes map into a list of {Word, Lines} tuples ordered by Word.
	sort_indexes_alpha(Indexes) ->
	  lists:keysort(1, maps:to_list(Indexes)).

	% Turns the Indexes map into a list of {Word, Lines} tuples ordered by how
	% many lines each word occurs on, most lines first. Lines must be a plain
	% list of line numbers (not range tuples), since its length is the sort key.
	sort_indexes_count(Indexes) ->
	  Tagged = [{Word, Lines, length(Lines)} || {Word, Lines} <- maps:to_list(Indexes)],
	  Ascending = lists:keysort(3, Tagged),
	  % Drop the temporary count, then flip to descending order.
	  lists:reverse([{Word, Lines} || {Word, Lines, _Count} <- Ascending]).

	% Fold step for building the index: records every word of Line as occurring
	% on line number N. The accumulator is {N, Map}, where Map has each word as
	% key and an ordered set of line numbers as value. Returns the accumulator
	% for the following line, {N + 1, UpdatedMap}.
	get_line_indexes(Line, {N, Map}) ->
	  {N + 1, update_word_indexes(get_words(Line), N, Map)}.

	% Splits Line into words on the separator characters in ?SPLIT_TOKENS
	% (space and common punctuation) and lowercases each word. Characters that
	% are not listed in ?SPLIT_TOKENS stay part of the word; empty words are
	% never produced.
	get_words(Line) ->
	  % string:lexemes/2 is the supported replacement for the obsolete
	  % string:tokens/2.
	  [string:lowercase(Word) || Word <- string:lexemes(Line, ?SPLIT_TOKENS)].

	% Records in Map that every word in Words occurs on line N and returns the
	% updated map. Line numbers are kept in an ordered set per word, so they stay
	% sorted and a word repeated on the same line is only recorded once.
	% (Replaces the clause head that used the invalid escaped cons "\|".)
	update_word_indexes(Words, N, Map) ->
	  AddLine = fun(Word, Acc) ->
	    Lines = ordsets:add_element(N, maps:get(Word, Acc, ordsets:new())),
	    Acc#{Word => Lines}
	  end,
	  lists:foldl(AddLine, Map, Words).

	% Rewrites every value of the Indexes map from a list of line numbers into
	% the equivalent list of {First, Last} ranges. Keys are left untouched.
	convert_to_ranges(Indexes) ->
	  maps:map(fun(_Word, Lines) -> list_to_ranges(Lines) end, Indexes).

	% Collapses a sorted, duplicate-free list of numbers into a list of
	% {First, Last} tuples, one per run of consecutive numbers.
	% Example: [1, 2, 3, 6, 9, 10, 12] becomes [{1, 3}, {6, 6}, {9, 10}, {12, 12}]
	list_to_ranges(List) ->
	  list_to_ranges(List, []).

	% Acc collects the ranges found so far, newest first. The list cons is
	% written with a plain "|"; the escaped "\|" form is not valid Erlang.
	list_to_ranges([], Acc) ->
	  lists:reverse(Acc);
	list_to_ranges(Remaining, Acc) ->
	  {Range, Rest} = next_range_sequence(Remaining),
	  list_to_ranges(Rest, [Range | Acc]).

	% Peels the leading run of consecutive numbers off a non-empty list and
	% returns {{First, Last}, Remaining}. The list must be sorted and free of
	% duplicates.
	% Example: [1, 2, 3, 5, 6, 8] returns {{1, 3}, [5, 6, 8]}
	next_range_sequence([First | Rest]) ->
	  next_range_sequence(Rest, {First, First}).

	% Grow the current range while the next number directly follows its end.
	% List patterns use a plain "|"; the escaped "\|" form is not valid Erlang.
	next_range_sequence([Next | Rest], {Start, End}) when Next == End + 1 ->
	  next_range_sequence(Rest, {Start, Next});
	next_range_sequence(Remaining, Found) ->
	  {Found, Remaining}.

	% Pretty prints every entry of a word index, one word per line. Accepts
	% either a list of {Word, Indexes} tuples or a map; anything that is not a
	% list is converted with maps:to_list/1 first. Returns ok.
	print_indexes(Indexes) when is_list(Indexes) ->
	  lists:foreach(fun(Entry) -> print_word_indexes(Entry) end, Indexes);
	print_indexes(Indexes) ->
	  print_indexes(maps:to_list(Indexes)).

	% Pretty prints one index entry as "Word -> Indexes" followed by a newline.
	% Returns ok.
	print_word_indexes({Word, Indexes}) ->
	  LineList = ordsets:to_list(Indexes),
	  ok = io:format("~s -> ", [Word]),
	  io:format("~w~n", [LineList]).

	% Opens the file Name for reading and returns its lines in file order.
	% Crashes with a badmatch if the file cannot be opened.
	get_file_contents(Name) ->
	  {ok, Device} = file:open(Name, [read]),
	  % get_all_lines/1 accumulates lines last-first, so flip them back.
	  ReversedLines = get_all_lines(Device),
	  lists:reverse(ReversedLines).

	% Reads every remaining line from the open IO device File and closes it.
	% Returns the lines in REVERSE order (last line first), each without its
	% trailing newline. Raises error({read_failed, Reason}) if reading fails;
	% the device is closed in that case too.
	get_all_lines(File) -> get_all_lines(File, []).
	get_all_lines(File, Lines) ->
	  case io:get_line(File, "") of
	    eof ->
	      file:close(File),
	      Lines;
	    {error, Reason} ->
	      file:close(File),
	      error({read_failed, Reason});
	    Line ->
	      get_all_lines(File, [strip_trailing_newline(Line) | Lines])
	  end.

	% Removes a single trailing "\n" if present. The last line of a file may not
	% end in a newline, so blindly dropping the last character would eat data.
	strip_trailing_newline(Line) ->
	  case lists:suffix("\n", Line) of
	    true -> lists:droplast(Line);
	    false -> Line
	  end.