Skip to content

Instantly share code, notes, and snippets.

@kuenishi
Last active January 1, 2016 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuenishi/8199663 to your computer and use it in GitHub Desktop.
Save kuenishi/8199663 to your computer and use it in GitHub Desktop.
Find exactly duplicated files/directories. $ ocamlopt -o findup unix.cmxa findup.ml
module M = Map.Make (
struct
type t = string
let compare = Pervasives.compare
end)
exception Bad_arg
module LimitQ =
struct
type 'a queue = ((int * 'a) list) * int (* limit *)
let empty limit =
if limit > 0 then ([], limit) else raise Bad_arg
let insert queue prio elt =
let list,limit = queue in
let rec ins rest = function
| [] -> List.rev_append rest [(prio,elt)];
| (p,e)::tl when prio < p ->
List.rev_append rest ((prio,elt)::(p,e)::tl);
| (p,e)::tl when prio >= p -> ins ((p,e)::rest) tl;
| _::_ -> raise Bad_arg
in
let list' = ins [] list in
if (List.length list') > limit then (List.tl list'),limit
else list',limit
exception Queue_is_empty
let pop = function
| [],_ -> raise Queue_is_empty
| hd::tl,limit -> (hd, (tl,limit))
end
type node = {
size: int;
checksum: string;
name: string;
}
let maybe_add s md5 path m =
let l = try M.find md5 m with Not_found -> [] in
let node = {size=s; checksum=md5; name=path} in
(s, md5, (M.add md5 (node::l) m));;
let rec search_dup_fold m path =
let dirs = Sys.readdir path in
let rec fold total_size acc_md5 acc_m = function
| [] -> total_size, acc_md5, acc_m;
| hd::tl ->
begin
let filename = Filename.concat path hd in
let s, md5, m2 = handle_name acc_m filename in
let md5' = Digest.to_hex (Digest.string (md5 ^ acc_md5)) in
fold (total_size + s) (md5') m2 tl
end
in
let s,md5,m2 = fold 0 "empty directory!!" m (Array.to_list dirs) in
(* Printf.printf "directory: %s (%s, %d)\n" path md5 s; *)
maybe_add s md5 path m2
and handle_name m path =
try
if Sys.is_directory path then
search_dup_fold m path
else
(* TODO: calculating md5 is just heavy *)
let md5 = Digest.to_hex (Digest.file path) in
let stat = Unix.stat path in
let size = stat.Unix.st_size in
(* Printf.printf "file: %s (%s, %d)\n" path md5 size; *)
maybe_add size md5 path m
with
Sys_error str ->
Printf.eprintf ">> %s\n" str;
(0, "", m)
let search_dup path =
Printf.printf "searching for dup files in %s\n" path;
let (_, _, t) = handle_name M.empty path in
t;;
let print_tree t =
(* List.map (fun s -> print_endline s) t;; *)
M.iter (fun k nodes ->
Printf.printf "\n%s: " k;
List.iter (fun n -> Printf.printf "(%s %d)," n.name n.size) nodes)
t
let sort_tree_by_size tree =
let rec sort k nodes q =
if List.length nodes > 1 then
let n = List.hd nodes in
let size = n.size * ((List.length nodes) - 1) in
LimitQ.insert q size nodes
else q
in
M.fold sort tree (LimitQ.empty 64);;
let pp_int i = match i with
| _ when i > 1000000 -> (string_of_int (i / 1000)) ^ " KB";
| _ when i > 1000 -> (string_of_int i) ^ " B";
| i -> (string_of_int i) ^ " B ";;
let rec print_list q =
try
let (p,nodes),nextq = LimitQ.pop q in
let n = List.hd nodes in
Printf.printf "%s (%s):\n" (String.sub n.checksum 0 9) (pp_int p);
List.iter (fun n -> Printf.printf "\t%s\n" n.name) nodes;
print_list nextq
with LimitQ.Queue_is_empty -> ()
let main () =
let path =
if Array.length Sys.argv = 1 then "."
else Sys.argv.(1)
in
let tree = search_dup path in
(* print_tree tree; *)
print_endline "";
print_list (sort_tree_by_size tree);;
let _ =
main ();;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment