Skip to content

Instantly share code, notes, and snippets.

@terjetyl
Last active August 29, 2015 14:27
Show Gist options
  • Save terjetyl/40adeef1c1c577089af6 to your computer and use it in GitHub Desktop.
Save terjetyl/40adeef1c1c577089af6 to your computer and use it in GitHub Desktop.
open System
open System.IO
/// A file on disk described by its size, its name and its path
/// (populated from a FileInfo by getFileInfo).
type File =
    { Size: int64
      Name: string
      Path: string }

    /// True when this file's size is strictly greater than `minSize`.
    member this.IsLargerThan minSize = this.Size > minSize
/// One set of duplicated files: the shared size and name plus every path
/// at which a copy was found.
type DuplicateFile =
    { Size: int64
      Name: string
      Paths: string list }
/// Builds a File record for `path` from a FileInfo: the full path, the
/// file name and the length reported by FileInfo.
let getFileInfo path =
    let info = FileInfo(path)
    // The Path label exists only on File, so this record expression
    // resolves to File rather than DuplicateFile.
    { Path = info.FullName; Name = info.Name; Size = info.Length }
/// Enumerates every file under `path`, including all subdirectories, and
/// maps each file path to a File record. The mapping is lazy (Seq.map).
let findAllFilesRecursively path =
    let allPaths = Directory.EnumerateFiles(path, "*.*", SearchOption.AllDirectories)
    Seq.map getFileInfo allPaths
/// Keeps only the files whose size is strictly greater than `minSizeInBytes`.
let filterBySize minSizeInBytes (files:seq<File>) =
    let isBigEnough (candidate:File) = candidate.IsLargerThan minSizeInBytes
    Seq.filter isBigEnough files
/// Groups `files` by the key computed by `groupByFunc` and returns only the
/// groups holding more than one file; the keys themselves are dropped.
let getDuplicates groupByFunc (files:seq<File>) =
    let hasSeveralMembers (_, members) = Seq.length members > 1
    files
    |> Seq.groupBy groupByFunc
    |> Seq.filter hasSeveralMembers
    |> Seq.map snd
/// Returns the groups of files that share both the same size and the same name.
let findDuplicatesByNameAndSize files =
    let sizeAndName (candidate:File) = candidate.Size, candidate.Name
    getDuplicates sizeAndName files
/// Refines groups of candidate duplicates by file content.
///
/// `files` is a sequence of candidate groups. Each group is split again by
/// the SHA-256 digest of every file's content, and only sub-groups with more
/// than one file are returned, flattened into a single sequence of groups.
///
/// Opening or reading a file may raise the usual System.IO exceptions.
let findDuplicatesByContent files =
    // Hash through a stream instead of using File.ReadAllBytes as the key:
    // ReadAllBytes loads the whole file into one array, which fails for files
    // over 2 GB and keeps entire file contents alive as grouping keys.
    // System.IO.File is fully qualified because this file also declares a
    // record named File.
    let contentDigest (file:File) =
        use sha256 = System.Security.Cryptography.SHA256.Create()
        use stream = System.IO.File.OpenRead(file.Path)
        let digest = sha256.ComputeHash(stream)
        System.BitConverter.ToString(digest)
    files |> Seq.collect (getDuplicates contentDigest)
/// Converts each group of duplicate files into a DuplicateFile: name and
/// size are taken from the first file of the group, and Paths lists the path
/// of every file in the group.
let organizeResults files =
    let toDuplicateFile (group:seq<File>) : DuplicateFile =
        let first = Seq.head group
        let allPaths = [ for file in group -> file.Path ]
        { Name = first.Name; Size = first.Size; Paths = allPaths }
    Seq.map toDuplicateFile files
/// Prints one "Name: ..., Size: ..." line for every entry of `fileList`.
let printResult fileList =
    // The entry type is left to inference from the Name/Size record labels,
    // exactly as in the unannotated original.
    let printEntry entry = printfn "Name: %s, Size: %d" entry.Name entry.Size
    Seq.iter printEntry fileList
/// Prints every duplicate with its name, size, number of copies and the
/// list of paths, or "No duplicates found" when `results` is empty.
let printResults (results:seq<DuplicateFile>) =
    let printLine fileName size count (paths:string list) =
        printfn "%s, size %i bytes found in %i places: \n - %s" fileName size count (String.Join("\n - ", paths))
    // Materialise the sequence exactly once. `results` is a lazy pipeline, so
    // calling Seq.length and then Seq.iter on it would run the whole
    // directory scan and every file read twice.
    match List.ofSeq results with
    | [] -> printfn "No duplicates found"
    | duplicates ->
        duplicates
        |> List.iter (fun x -> printLine x.Name x.Size (List.length x.Paths) x.Paths)
/// Validates the command line and returns `(minSizeInBytes, folder)`.
///
/// Expects exactly two arguments: a non-negative 64-bit integer (the minimum
/// size in bytes) and the path of an existing folder.
///
/// Raises an exception (via failwith) when the argument count is wrong, the
/// first argument is not a number or is negative, or the folder does not exist.
let parseArgs (argv:string[]) =
    if argv.Length <> 2 then
        failwith "Need 2 arguments, minSizeInBytes and a valid folderpath"
    let parsedOk, minSizeInBytes = Int64.TryParse(argv.[0])
    if not parsedOk then
        failwith "First argument is not a valid number"
    if minSizeInBytes < 0L then
        failwith "minSizeInBytes must be a positive number"
    let folder = argv.[1]
    if not (Directory.Exists(folder)) then
        failwith (sprintf "cannot find folder: %s" folder)
    // Return the folder exactly as it was validated. Doubling every "/" would
    // hand back a different string from the one checked above and corrupts
    // forward-slash UNC paths ("//server/share" -> "////server//share").
    (minSizeInBytes, folder)
/// Runs the whole search: lists every file under `folder`, keeps those larger
/// than `minSizeInBytes`, groups them by name and size, refines the groups by
/// content and converts the remaining groups to DuplicateFile records.
let findDuplicateFiles minSizeInBytes folder =
    let candidates = filterBySize minSizeInBytes (findAllFilesRecursively folder)
    let sameNameAndSize = findDuplicatesByNameAndSize candidates
    let sameContent = findDuplicatesByContent sameNameAndSize
    organizeResults sameContent
/// Entry point: finds duplicate files over a minimum size and prints them
/// together with the elapsed time. Expects two arguments (minSizeInBytes and
/// a folder path); throws an exception on faulty args or missing access to
/// the folder path.
[<EntryPoint>]
let main argv =
    let timer = System.Diagnostics.Stopwatch.StartNew()
    printfn "Job started"
    // parseArgs yields (minSizeInBytes, folder); ||> feeds both to the search.
    parseArgs argv
    ||> findDuplicateFiles
    |> printResults
    timer.Stop()
    printfn "Job ended in %i ms" timer.ElapsedMilliseconds
    0 // return an integer exit code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment