Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save redink/350e51837917d5ba3c7edc25aeb5c88e to your computer and use it in GitHub Desktop.
Save redink/350e51837917d5ba3c7edc25aeb5c88e to your computer and use it in GitHub Desktop.
-module(find_top_100_from_1gb_file_1mb_memory_limit).
-compile(export_all).
%% @doc Entry point: reports the 100 most frequent lines of "test_file.file".
%%
%% Steps:
%%   1. Set this process' `max_heap_size' to 131072 words (1 MiB with 8-byte
%%      words). With `kill => false' the process is not terminated when the
%%      limit is exceeded; `error_logger => true' only makes the VM log it.
%%   2. Delete bucket/count files left behind by an earlier run. Both kinds
%%      of file are written in append mode, so stale files would otherwise
%%      be counted again.
%%   3. Split the input into hash-bucket files (read_file/2).
%%   4. Count the distinct lines of every bucket (map_count_single_file/0).
%%   5. Merge the per-bucket counts (find_topn/0).
%%
%% Returns the result of find_topn/0. Crashes with `badmatch' when the input
%% file cannot be opened or a stale file cannot be deleted.
start() ->
    erlang:process_flag(max_heap_size, #{size => 131072, kill => false, error_logger => true}),
    ok = remove_stale_tmp_files(),
    {ok, IO} = file:open("test_file.file", [read, binary, read_ahead]),
    read_file(file:read_line(IO), IO),
    map_count_single_file(),
    find_topn().

%% Removes the "tmp_file_hash_*" bucket (.tmp1) and count (.tmp1.tmp2) files
%% that a previous run of this module created in the current directory.
remove_stale_tmp_files() ->
    Stale = filelib:wildcard("./tmp_file_hash_*.tmp1") ++
            filelib:wildcard("./tmp_file_hash_*.tmp1.tmp2"),
    lists:foreach(fun(File) -> ok = file:delete(File) end, Stale).
%% @doc Distributes the lines read from the open device `IO' into bucket files.
%%
%% The first argument is the result of the most recent file:read_line/1 call
%% on `IO':
%%   `eof'         - closes `IO' and returns `ok';
%%   `{ok, Data0}' - appends the line (without its newline) to
%%                   "tmp_file_hash_<Index>.tmp1", where Index is the hash of
%%                   the line in the range 0..4999, then reads the next line.
%%
%% Equal lines always hash to the same bucket, so each bucket can later be
%% counted independently of the others.
%%
%% Crashes with `badmatch' when a bucket file cannot be written, and with
%% `function_clause' when file:read_line/1 returns `{error, _}'.
read_file(eof, IO) ->
    file:close(IO),
    ok;
read_file({ok, Data0}, IO) ->
    %% Keep only what precedes the newline. The last line of a file may have
    %% no terminator, in which case the split yields a single element.
    [Data | _] = binary:split(Data0, <<"\n">>),
    %% phash2/2 replaces the deprecated phash/2; only the bucket numbering
    %% differs, and nothing else relies on the numbers themselves.
    Index = erlang:phash2(Data, 5000),
    %% Assert the write: a silently dropped line would corrupt the counts.
    ok = file:write_file("tmp_file_hash_" ++ erlang:integer_to_list(Index) ++ ".tmp1",
                         [Data, <<"\n">>], [append]),
    read_file(file:read_line(IO), IO).
%% @doc Runs the counting step on every "*.tmp1" file found in the current
%% directory. Returns `ok'.
map_count_single_file() ->
    BucketFiles = filelib:wildcard("./*.tmp1"),
    map_count_single_file_do(BucketFiles).
%% @doc Calls count_single_file/1 on each file of the list, in order, and
%% returns `ok'. Any result other than `ok' from count_single_file/1 crashes
%% with `badmatch'.
map_count_single_file_do(Files) ->
    lists:foreach(fun(BucketFile) -> ok = count_single_file(BucketFile) end, Files).
%% @doc Counts how often each non-empty line occurs in `File' and appends the
%% totals to `File ++ ".tmp2"' via write_tmp2_file/2.
%%
%% The whole file is read into memory at once. Returns `ok'; crashes with
%% `badmatch' when `File' cannot be read.
count_single_file(File) ->
    {ok, Contents} = file:read_file(File),
    Pieces = binary:split(Contents, <<"\n">>, [global]),
    %% Splitting leaves an empty binary after the final newline; drop it.
    Words = lists:filter(fun(Word) -> Word =/= <<>> end, Pieces),
    Bump = fun(Word, Counts) -> dict:update_counter(Word, 1, Counts) end,
    Totals = lists:foldl(Bump, dict:new(), Words),
    write_tmp2_file(dict:to_list(Totals), File ++ ".tmp2"),
    ok.
%% @doc Appends one "Key|Count\n" line per `{Key, Count}' pair to `File'.
%%
%% `Key' is iodata (a binary in this module) and `Count' an integer. An empty
%% list leaves the file system untouched. All lines are written with a single
%% append instead of reopening the file for every pair.
%%
%% Returns `ok'. Crashes with `badmatch' when the write fails, and with
%% `function_clause' when an element is not a 2-tuple.
write_tmp2_file([], _) ->
    ok;
write_tmp2_file(Counts, File) ->
    Lines = lists:map(
              fun({K, V}) -> [K, <<"|">>, erlang:integer_to_binary(V), <<"\n">>] end,
              Counts),
    ok = file:write_file(File, Lines, [append]).
%% @doc Shorthand for find_topn/1 with N fixed at 100.
find_topn() ->
    DefaultN = 100,
    find_topn(DefaultN).
%% @doc Scans every "*.tmp1.tmp2" count file in the current directory and
%% returns the retained entries as a list of `{Line, Count}' tuples.
%%
%% The entries are kept in a gb_tree keyed by `{Count, Line}', so the result
%% is ordered by ascending count (ties by ascending line).
find_topn(N) ->
    CountFiles = filelib:wildcard("./*.tmp1.tmp2"),
    Tree = find_topn_map_single_file(CountFiles, gb_trees:empty(), N),
    [{Line, Count} || {Count, Line} <- gb_trees:keys(Tree)].
%% @doc Folds the count files into the tree `GBT', one file after another.
%%
%% Each file is opened for line-wise binary reading and handed to
%% find_topn_map_single_file_do/4, which also closes it. Returns the tree
%% after the last file; crashes with `badmatch' when a file cannot be opened.
find_topn_map_single_file(Files, GBT, N) ->
    ScanFile =
        fun(File, Tree) ->
            {ok, IO} = file:open(File, [read, binary, read_ahead]),
            find_topn_map_single_file_do(file:read_line(IO), IO, Tree, N)
        end,
    lists:foldl(ScanFile, GBT, Files).
%% @doc Feeds the "Word|Count" lines of the open device `IO' into the tree.
%%
%% The first argument is the result of the most recent file:read_line/1 call
%% on `IO':
%%   `eof'         - closes `IO' and returns the tree;
%%   `{ok, Data0}' - parses the line, offers `{Count, Word}' to the tree
%%                   (which holds at most `N' keys) and reads the next line.
%%
%% Tree keys are `{Count, Word}' and every value is 0; only the key order is
%% used. Crashes when a line has no "|" or a non-integer count.
find_topn_map_single_file_do(eof, IO, GBT, _) ->
    file:close(IO),
    GBT;
find_topn_map_single_file_do({ok, Data0}, IO, GBT0, N) ->
    GBT1 = keep_top_n(parse_count_line(Data0), GBT0, N),
    find_topn_map_single_file_do(file:read_line(IO), IO, GBT1, N).

%% Turns a "Word|Count" line (trailing newline optional) into `{Count, Word}'.
%% The line is split at its LAST "|": the count never contains that byte, but
%% the word may.
parse_count_line(Line0) ->
    [Line | _] = binary:split(Line0, <<"\n">>),
    {Pos, _} = lists:last(binary:matches(Line, <<"|">>)),
    <<Word:Pos/binary, $|, Count/binary>> = Line,
    {erlang:binary_to_integer(Count), Word}.

%% Inserts `Key' while the tree has fewer than `N' keys; once full, `Key'
%% replaces the smallest key only when it is larger. A non-positive `N'
%% keeps the tree unchanged rather than asking an empty tree for its
%% smallest key.
keep_top_n(_Key, Tree, N) when is_integer(N), N =< 0 ->
    Tree;
keep_top_n(Key, Tree, N) ->
    case gb_trees:size(Tree) < N of
        true ->
            gb_trees:insert(Key, 0, Tree);
        false ->
            {Smallest, _} = gb_trees:smallest(Tree),
            case Smallest < Key of
                true ->
                    {_, _, Rest} = gb_trees:take_smallest(Tree),
                    gb_trees:insert(Key, 0, Rest);
                false ->
                    Tree
            end
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment