Skip to content

Instantly share code, notes, and snippets.

@pmech

pmech/gist:1105252

Created Jul 25, 2011
Embed
What would you like to do?
Erlang script for finding duplicate files.
#! /usr/bin/escript
%% Number of bytes requested per file:read/2 call while hashing (128 KiB).
-define(BLOCK_SIZE, 128*1024).
%% @doc Escript entry point.
%% With no arguments the current directory (".") is scanned; otherwise every
%% argument is passed on as a directory to scan for duplicate files.
main([]) ->
    main(["."]);
main(Dirs) ->
    %% Assert that crypto (needed for hashing) really started instead of
    %% discarding the result; an already-running crypto still yields {ok, _}.
    {ok, _Started} = application:ensure_all_started(crypto),
    do(Dirs).
%% Finds the duplicated files below Dirs and prints them.
do(Dirs) ->
    Dups = duplicate_files(Dirs),
    print_dups(Dups).
%% Returns the {Hash, FileNames} entries whose FileNames list holds more
%% than one file, i.e. the groups of files sharing the same content hash.
duplicate_files(Directories) ->
    HashEntries = dict:to_list(pfile_hashes(Directories)),
    HasSeveralFiles = fun({_Hash, [_, _ | _]}) -> true;
                         ({_Hash, _Files}) -> false
                      end,
    lists:filter(HasSeveralFiles, HashEntries).
%% @doc Hashes every directory in Directories in parallel (one pmap job per
%% directory) and merges the per-directory dicts into a single dict that
%% maps a content hash to the list of file names having that hash.
pfile_hashes(Directories) ->
    HashDicts = pmap(fun file_hashes/1, Directories),
    MergeFiles = fun(_Hash, Files1, Files2) ->
                         %% The same path shows up in several dicts when
                         %% directory arguments repeat or overlap; usort
                         %% drops those repeats so a file is never reported
                         %% as a duplicate of itself.
                         lists:usort(Files1 ++ Files2)
                 end,
    lists:foldl(fun(HashDict, FinalDict) ->
                        dict:merge(MergeFiles, FinalDict, HashDict)
                end,
                dict:new(),
                HashDicts).
%% Walks Dir recursively and returns a dict that maps each content hash to
%% the list of file names found with that hash.
file_hashes(Dir) ->
    AddFile = fun(Path, Acc) ->
                      Hash = sha_hash_file(Path),
                      dict:append(Hash, Path, Acc)
              end,
    filelib:fold_files(Dir, ".*", true, AddFile, dict:new()).
%% @doc Returns the SHA-1 digest of the contents of FileName.
%% Crashes with badmatch if the file cannot be opened.
sha_hash_file(FileName) ->
    {ok, Device} = file:open(FileName, [read, raw, binary]),
    %% try/after guarantees the descriptor is released even when reading or
    %% hashing crashes part-way through the file.
    try
        %% crypto:sha_init/sha_update/sha_final were removed from OTP; the
        %% generic hash_* API is the supported replacement.
        sha_hash_device(Device, crypto:hash_init(sha))
    after
        file:close(Device)
    end.

%% Feeds Device into the hash context SHA1 in ?BLOCK_SIZE chunks and returns
%% the final digest once eof is reached. A read error is not matched and
%% therefore crashes the caller.
sha_hash_device(Device, SHA1) ->
    case file:read(Device, ?BLOCK_SIZE) of
        {ok, Data} ->
            sha_hash_device(Device, crypto:hash_update(SHA1, Data));
        eof ->
            crypto:hash_final(SHA1)
    end.
%% Prints every duplicate group: a header line with the number of files in
%% the group, followed by the file names themselves.
print_dups(Dups) ->
    PrintGroup = fun({_Hash, Names}) ->
                         Count = length(Names),
                         io:format("~b duplicates\n", [Count]),
                         print_names(Names)
                 end,
    lists:foreach(PrintGroup, Dups).
%% @doc Prints Names in sorted order, one name per line with a leading space.
print_names(Names) ->
    %% ~ts instead of ~s: file names may contain code points above 255, for
    %% which a plain ~s raises badarg.
    lists:foreach(fun(Name) -> io:format(" ~ts\n", [Name]) end,
                  lists:sort(Names)).
% parallel map implementation (copied from web, TODO: check where from)
%% Applies F to every element of L, each call in its own process, and
%% returns the results in the order of L.
pmap(F, L) ->
    Pids = spawn_jobs(F, L),
    await(Pids).
%% Spawns one process per element of L. Each process evaluates F on its
%% element and sends {WorkerPid, Outcome} to the calling process, where
%% Outcome is {ok, Result} on success or whatever `catch` produced otherwise.
%% Returns the worker pids in the order of L.
spawn_jobs(F, L) ->
    Parent = self(),
    Start = fun(X) ->
                    spawn(fun() ->
                                  Outcome = (catch {ok, F(X)}),
                                  Parent ! {self(), Outcome}
                          end)
            end,
    [Start(X) || X <- L].
%% @doc Collects one result per worker pid, in the order of the pid list.
%% Each worker is expected to send {Pid, Outcome} to the calling process.
%% Returns the list of Result values taken from the {ok, Result} outcomes.
%% On the first failed worker the remaining workers are killed, any reply
%% they already sent is flushed, and an error is raised:
%%   {'EXIT', _}  for a worker that crashed or exited,
%%   {nocatch, T} for a worker whose function threw T.
await([]) ->
    [];
await([H | T]) ->
    receive
        {H, {ok, Res}} ->
            [Res | await(T)];
        {H, {'EXIT', _} = Err} ->
            abort_jobs(T),
            erlang:error(Err);
        {H, Thrown} ->
            %% The workers wrap the call in an old-style `catch`, so a
            %% throw arrives as the bare thrown term. Without this clause
            %% such a reply matches nothing and await blocks forever.
            abort_jobs(T),
            erlang:error({nocatch, Thrown})
    end.

%% Kills the given workers and removes at most one already-delivered reply
%% from each of them out of the mailbox.
abort_jobs(Pids) ->
    lists:foreach(fun(Pid) -> exit(Pid, kill) end, Pids),
    lists:foreach(fun(Pid) ->
                          receive {Pid, _} -> ok after 0 -> ok end
                  end,
                  Pids).
% vim:ft=erlang
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.