Last active
January 1, 2016 20:49
-
-
Save kuenishi/8199663 to your computer and use it in GitHub Desktop.
Find exactly duplicated files/directories.
$ ocamlopt -o findup unix.cmxa findup.ml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module M = Map.Make ( | |
struct | |
type t = string | |
let compare = Pervasives.compare | |
end) | |
exception Bad_arg | |
module LimitQ = | |
struct | |
type 'a queue = ((int * 'a) list) * int (* limit *) | |
let empty limit = | |
if limit > 0 then ([], limit) else raise Bad_arg | |
let insert queue prio elt = | |
let list,limit = queue in | |
let rec ins rest = function | |
| [] -> List.rev_append rest [(prio,elt)]; | |
| (p,e)::tl when prio < p -> | |
List.rev_append rest ((prio,elt)::(p,e)::tl); | |
| (p,e)::tl when prio >= p -> ins ((p,e)::rest) tl; | |
| _::_ -> raise Bad_arg | |
in | |
let list' = ins [] list in | |
if (List.length list') > limit then (List.tl list'),limit | |
else list',limit | |
exception Queue_is_empty | |
let pop = function | |
| [],_ -> raise Queue_is_empty | |
| hd::tl,limit -> (hd, (tl,limit)) | |
end | |
type node = { | |
size: int; | |
checksum: string; | |
name: string; | |
} | |
let maybe_add s md5 path m = | |
let l = try M.find md5 m with Not_found -> [] in | |
let node = {size=s; checksum=md5; name=path} in | |
(s, md5, (M.add md5 (node::l) m));; | |
let rec search_dup_fold m path = | |
let dirs = Sys.readdir path in | |
let rec fold total_size acc_md5 acc_m = function | |
| [] -> total_size, acc_md5, acc_m; | |
| hd::tl -> | |
begin | |
let filename = Filename.concat path hd in | |
let s, md5, m2 = handle_name acc_m filename in | |
let md5' = Digest.to_hex (Digest.string (md5 ^ acc_md5)) in | |
fold (total_size + s) (md5') m2 tl | |
end | |
in | |
let s,md5,m2 = fold 0 "empty directory!!" m (Array.to_list dirs) in | |
(* Printf.printf "directory: %s (%s, %d)\n" path md5 s; *) | |
maybe_add s md5 path m2 | |
and handle_name m path = | |
try | |
if Sys.is_directory path then | |
search_dup_fold m path | |
else | |
(* TODO: calculating md5 is just heavy *) | |
let md5 = Digest.to_hex (Digest.file path) in | |
let stat = Unix.stat path in | |
let size = stat.Unix.st_size in | |
(* Printf.printf "file: %s (%s, %d)\n" path md5 size; *) | |
maybe_add size md5 path m | |
with | |
Sys_error str -> | |
Printf.eprintf ">> %s\n" str; | |
(0, "", m) | |
let search_dup path = | |
Printf.printf "searching for dup files in %s\n" path; | |
let (_, _, t) = handle_name M.empty path in | |
t;; | |
let print_tree t = | |
(* List.map (fun s -> print_endline s) t;; *) | |
M.iter (fun k nodes -> | |
Printf.printf "\n%s: " k; | |
List.iter (fun n -> Printf.printf "(%s %d)," n.name n.size) nodes) | |
t | |
let sort_tree_by_size tree = | |
let rec sort k nodes q = | |
if List.length nodes > 1 then | |
let n = List.hd nodes in | |
let size = n.size * ((List.length nodes) - 1) in | |
LimitQ.insert q size nodes | |
else q | |
in | |
M.fold sort tree (LimitQ.empty 64);; | |
let pp_int i = match i with | |
| _ when i > 1000000 -> (string_of_int (i / 1000)) ^ " KB"; | |
| _ when i > 1000 -> (string_of_int i) ^ " B"; | |
| i -> (string_of_int i) ^ " B ";; | |
let rec print_list q = | |
try | |
let (p,nodes),nextq = LimitQ.pop q in | |
let n = List.hd nodes in | |
Printf.printf "%s (%s):\n" (String.sub n.checksum 0 9) (pp_int p); | |
List.iter (fun n -> Printf.printf "\t%s\n" n.name) nodes; | |
print_list nextq | |
with LimitQ.Queue_is_empty -> () | |
let main () = | |
let path = | |
if Array.length Sys.argv = 1 then "." | |
else Sys.argv.(1) | |
in | |
let tree = search_dup path in | |
(* print_tree tree; *) | |
print_endline ""; | |
print_list (sort_tree_by_size tree);; | |
let _ = | |
main ();; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment