Last active
August 29, 2015 14:27
-
-
Save terjetyl/40adeef1c1c577089af6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.IO | |
/// A single file on disk, described by its size, its file name and its path.
type File =
    { Size: int64
      Name: string
      Path: string }

    /// True when this file's size is strictly greater than `minSize`.
    member this.IsLargerThan minSize = this.Size > minSize
/// A set of duplicated files: the shared size and name, plus every path
/// at which a copy was found.
type DuplicateFile =
    { Size: int64
      Name: string
      Paths: string list }
/// Builds a `File` record for `path` from the name, length and full path
/// reported by `FileInfo`.
let getFileInfo path : File =
    let info = FileInfo(path)
    { Name = info.Name
      Size = info.Length
      Path = info.FullName }
/// Enumerates every file below `path`, including all subdirectories, and
/// maps each one to a `File` record via `getFileInfo`.
let findAllFilesRecursively path =
    // The enumeration is created up front; the records are produced lazily.
    let allPaths = Directory.EnumerateFiles(path, "*.*", SearchOption.AllDirectories)
    Seq.map getFileInfo allPaths
/// Keeps only the files whose size is strictly greater than `minSizeInBytes`.
let filterBySize minSizeInBytes (files:seq<File>) =
    Seq.filter (fun (candidate: File) -> candidate.IsLargerThan minSizeInBytes) files
/// Groups `files` by the key computed with `groupByFunc` and returns only
/// the groups that contain more than one file.
let getDuplicates groupByFunc (files:seq<File>) =
    files
    |> Seq.groupBy groupByFunc
    |> Seq.choose (fun (_, members) ->
        if Seq.length members > 1 then Some members else None)
/// Returns the groups of files that share both the same size and the same name.
let findDuplicatesByNameAndSize files =
    let sizeAndName (file: File) = file.Size, file.Name
    getDuplicates sizeAndName files
/// Refines groups of candidate duplicates by comparing file contents.
///
/// `files` is a sequence of candidate groups; each group is split into
/// sub-groups of files with identical content, and only sub-groups holding
/// more than one file are returned (flattened into a single sequence).
///
/// Content is compared through a SHA-256 digest computed by streaming the
/// file, instead of using the complete byte array as the grouping key. This
/// avoids keeping every candidate file fully in memory and also works for
/// files too large for `File.ReadAllBytes`.
let findDuplicatesByContent files =
    // Streams the file at `file.Path` and returns its SHA-256 digest as text.
    let contentDigest (file: File) =
        // Fully qualified on purpose: the record type `File` defined in this
        // file shares its name with `System.IO.File`.
        use stream = System.IO.File.OpenRead(file.Path)
        use sha256 = System.Security.Cryptography.SHA256.Create()
        System.Convert.ToBase64String(sha256.ComputeHash(stream))
    files |> Seq.collect (getDuplicates contentDigest)
/// Converts each group of duplicate files into a `DuplicateFile` summary.
/// Name and size are taken from the first file of the group; `Paths`
/// collects the path of every file in the group.
let organizeResults files =
    let toDuplicate (group: seq<File>) : DuplicateFile =
        let first = Seq.head group
        let allPaths = group |> Seq.map (fun entry -> entry.Path) |> List.ofSeq
        { Name = first.Name; Size = first.Size; Paths = allPaths }
    Seq.map toDuplicate files
/// Prints the name and size of every entry in `fileList`, one per line.
let printResult fileList =
    let printOne file = printfn "Name: %s, Size: %d" file.Name file.Size
    Seq.iter printOne fileList
/// Prints a report of the duplicates in `results`: for each entry its name,
/// size, number of copies and every path, or "No duplicates found" when the
/// sequence is empty.
let printResults (results:seq<DuplicateFile>) =
    let printLine fileName size count (paths:string list) =
        printfn "%s, size %i bytes found in %i places: \n - %s" fileName size count (String.Join("\n - ", paths))
    // Materialise the sequence once. Checking the length and then iterating
    // the lazy sequence would evaluate it twice, repeating whatever work
    // produces it (directory walk, file reads).
    match List.ofSeq results with
    | [] -> printfn "No duplicates found"
    | duplicates ->
        for duplicate in duplicates do
            printLine duplicate.Name duplicate.Size (List.length duplicate.Paths) duplicate.Paths
/// Validates the command line and returns `(minSizeInBytes, folder)`.
///
/// Expects exactly two arguments: a non-negative 64-bit integer and the
/// path of an existing directory. Raises (via `failwith`) when the argument
/// count is wrong, the number cannot be parsed or is negative, or the
/// directory does not exist.
let parseArgs (argv:string[]) =
    if argv.Length <> 2 then
        failwith "Need 2 arguments, minSizeInBytes and a valid folderpath"
    let minSizeInBytes =
        match Int64.TryParse(argv.[0]) with
        | true, parsed -> parsed
        | false, _ -> failwith "First argument is not a valid number"
    if minSizeInBytes < 0L then
        failwith "minSizeInBytes must be a positive number"
    let folder = argv.[1]
    if not (Directory.Exists(folder)) then
        failwith (sprintf "cannot find folder: %s" folder)
    // Return the folder exactly as it was validated. Doubling every "/" gave
    // a different path from the one checked above and could break paths such
    // as "//server/share".
    (minSizeInBytes, folder)
/// Full pipeline: lists all files below `folder`, keeps those larger than
/// `minSizeInBytes`, groups them by name and size, refines the groups by
/// content and returns the result as `DuplicateFile` summaries.
let findDuplicateFiles minSizeInBytes folder =
    let largeEnough = filterBySize minSizeInBytes (findAllFilesRecursively folder)
    let sameNameAndSize = findDuplicatesByNameAndSize largeEnough
    let sameContent = findDuplicatesByContent sameNameAndSize
    organizeResults sameContent
/// Entry point: finds duplicate files over a minimum size and prints them,
/// together with the elapsed time. Exceptions raised for faulty arguments or
/// an inaccessible folder path are not caught here.
[<EntryPoint>]
let main argv =
    let stopwatch = System.Diagnostics.Stopwatch.StartNew()
    printfn "Job started"
    let minSizeInBytes, folder = parseArgs argv
    printResults (findDuplicateFiles minSizeInBytes folder)
    stopwatch.Stop()
    printfn "Job ended in %i ms" stopwatch.ElapsedMilliseconds
    // Exit code reported to the operating system.
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment