Skip to content

Instantly share code, notes, and snippets.

@kylethebaker
Last active July 10, 2017 06:18
Show Gist options
  • Save kylethebaker/85b2f10f210b3760ab9e5be7d249f8dd to your computer and use it in GitHub Desktop.
Save kylethebaker/85b2f10f210b3760ab9e5be7d249f8dd to your computer and use it in GitHub Desktop.
Functional Erlang Week 2 Assignment
-module(assignment2).
-export([print_indexes/1]).
-export([get_indexes/1, get_top_n_words/2, get_alpha_indexes/1]).
-define(SPLIT_TOKENS, " .,'\"()[]{}!?-\\").
% Builds the complete word index for Filename and returns it as a list of
% {Word, Ranges} pairs, where Ranges is a list of {FirstLine, LastLine} tuples.
get_indexes(Filename) ->
    RangeMap = get_indexes_ranges(Filename),
    maps:to_list(RangeMap).
% Gets the top N words from the file, ranked by the number of distinct lines
% each word appears on (most lines first).
%
% Filename: path of the text file to index.
% N:        maximum number of entries to return (non-negative integer).
%
% Returns a list of {Word, LineCount} tuples. If the file contains fewer than
% N distinct words, every word is returned instead of crashing.
get_top_n_words(Filename, N) ->
    Ranked = sort_indexes_count(get_indexes_list(Filename)),
    % lists:sublist/2 tolerates N > length(Ranked), whereas lists:split/2
    % raises badarg in that case.
    [{Word, length(Lines)} || {Word, Lines} <- lists:sublist(Ranked, N)].
% Returns the word index of Filename as a list of {Word, LineNumbers} pairs
% ordered by word.
get_alpha_indexes(Filename) ->
    sort_indexes_alpha(get_indexes_list(Filename)).
% Builds the word index of Filename as a map whose values are line ranges
% rather than individual line numbers, e.g. [{1, 3}, {6, 6}, {8, 9}].
get_indexes_ranges(Filename) ->
    convert_to_ranges(get_indexes_list(Filename)).
% Builds the word index of Filename as a map from each word to the ordered
% list of line numbers it occurs on, e.g. [1, 2, 3, 6, 8, 9].
% Line numbering starts at 1.
get_indexes_list(Filename) ->
    Initial = {1, #{}},
    {_NextLineNo, Index} =
        lists:foldl(fun get_line_indexes/2, Initial, get_file_contents(Filename)),
    Index.
% Turns the index map into a list of {Word, Lines} pairs sorted by Word.
sort_indexes_alpha(Indexes) ->
    lists:keysort(1, maps:to_list(Indexes)).
% Turns the index map into a list of {Word, Lines} pairs ordered from the
% word occurring on the most lines to the word occurring on the fewest.
% The values must be plain line-number lists, not range tuples, because the
% ranking is based on the length of each list.
sort_indexes_count(Indexes) ->
    Tagged = [{Word, Lines, length(Lines)} || {Word, Lines} <- maps:to_list(Indexes)],
    Ascending = lists:keysort(3, Tagged),
    % Prepending while folding both strips the count and reverses the list,
    % yielding descending order.
    lists:foldl(fun({Word, Lines, _Count}, Acc) -> [{Word, Lines} | Acc] end,
                [],
                Ascending).
% Fold step used while indexing a file: records every word of Line as
% occurring on line number N. The accumulator is {N, Map}, where Map has each
% word as key and an ordered set of line numbers as value. Returns the
% accumulator for the following line, {N + 1, UpdatedMap}.
get_line_indexes(Line, {N, Map}) ->
    {N + 1, update_word_indexes(get_words(Line), N, Map)}.
% Splits Line into words on the separator characters in ?SPLIT_TOKENS and
% normalizes every word to lowercase. Characters that are not listed in
% ?SPLIT_TOKENS stay part of the word.
get_words(Line) ->
    [string:lowercase(Word) || Word <- string:tokens(Line, ?SPLIT_TOKENS)].
% Records in Map that every word in Words occurs on line N and returns the
% updated map. Line numbers are kept in an ordered set per word, so they stay
% sorted and a word repeated on the same line is only recorded once.
update_word_indexes(Words, N, Map) ->
    Record = fun(Word, Acc) ->
        Seen = maps:get(Word, Acc, ordsets:new()),
        Acc#{Word => ordsets:add_element(N, Seen)}
    end,
    lists:foldl(Record, Map, Words).
% Rewrites every value of the index map from a list of line numbers into a
% list of {First, Last} ranges; the keys are left untouched.
convert_to_ranges(Indexes) ->
    maps:map(fun(_Word, LineNumbers) -> list_to_ranges(LineNumbers) end, Indexes).
% Collapses a sorted list of numbers into a list of {First, Last} tuples, one
% per run of consecutive values.
% Example: [1, 2, 3, 6, 9, 10, 12] becomes [{1, 3}, {6, 6}, {9, 10}, {12, 12}]
list_to_ranges(Numbers) ->
    list_to_ranges(Numbers, []).

% Worker: Acc collects the ranges found so far, most recent first.
list_to_ranges([], Acc) ->
    lists:reverse(Acc);
list_to_ranges(Remaining, Acc) ->
    {Range, Rest} = next_range_sequence(Remaining),
    list_to_ranges(Rest, [Range | Acc]).
% Consumes the leading run of consecutive numbers from a non-empty list and
% returns {{First, Last}, Remaining}. The list must be sorted and free of
% duplicates.
% Example: [1, 2, 3, 5, 6, 8] returns {{1, 3}, [5, 6, 8]}
next_range_sequence([First | Rest]) ->
    next_range_sequence(Rest, {First, First}).

% Extends the current range while the next element directly follows its
% upper bound; otherwise the range is complete.
next_range_sequence([Next | Rest], {Start, End}) when End + 1 == Next ->
    next_range_sequence(Rest, {Start, Next});
next_range_sequence(Remaining, Range) ->
    {Range, Remaining}.
% Pretty prints a whole word index, one word per line. Accepts either a list
% of {Word, Indexes} pairs or a map, which is converted to such a list first.
% Always returns ok.
print_indexes(Indexes) when is_list(Indexes) ->
    lists:foreach(fun print_word_indexes/1, Indexes),
    ok;
print_indexes(Indexes) ->
    print_indexes(maps:to_list(Indexes)).
% Pretty prints one index entry as "Word -> Indexes" followed by a newline.
% Returns ok.
print_word_indexes({Word, Indexes}) ->
    io:format("~s -> ~w~n", [Word, ordsets:to_list(Indexes)]),
    ok.
% Reads the file called Name and returns its lines in file order.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    % get_all_lines/1 yields the lines last-first, so flip them back.
    ReversedLines = get_all_lines(Device),
    lists:reverse(ReversedLines).
% Reads every remaining line from the open IO device File, then closes it.
%
% Returns the lines in REVERSE order (last line of the file first), each with
% its line ending removed. A final line that does not end in a newline is
% returned intact. Raises error({read_failed, Reason}) if reading fails; the
% device is closed before the error is raised.
get_all_lines(File) -> get_all_lines(File, []).

% Worker: Lines accumulates the lines read so far, most recent first.
get_all_lines(File, Lines) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Lines;
        {error, Reason} ->
            % Do not leak the handle when the read fails.
            file:close(File),
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_line_ending(Line) | Lines])
    end.

% Removes a single trailing "\n" or "\r\n" from Line, if present. Unlike
% lists:droplast/1 this never removes a real character from a last line that
% has no terminating newline.
strip_line_ending(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment