Skip to content

Instantly share code, notes, and snippets.

@mastoj
Last active August 29, 2015 14:27
Show Gist options
  • Save mastoj/83b0e0188fcec69cd3fb to your computer and use it in GitHub Desktop.
Save mastoj/83b0e0188fcec69cd3fb to your computer and use it in GitHub Desktop.
open System
[<AutoOpen>]
module FileHelper =
type FileCheck =
{
FileName: string
Size: int64
}
let directory dir = new System.IO.DirectoryInfo(dir)
let getFileBytes (file:System.IO.FileInfo) =
use fileStream = file.OpenRead()
let byteArr = Array.init (int file.Length) (fun _ -> new System.Byte())
let readBytes = fileStream.Read(byteArr, 0, (int file.Length))
byteArr
let createFileCheck (file:System.IO.FileInfo) =
try
Some {
FileName = file.FullName
Size = file.Length
}
with
| :? System.IO.PathTooLongException as e ->
printfn "File error: %s" e.Message
None
let rec getFiles filter (dir:System.IO.DirectoryInfo) =
seq {
yield! dir.GetFiles()
|> Seq.filter filter
|> Seq.map createFileCheck
|> Seq.filter Option.isSome
|> Seq.map (fun (Some f) -> f)
yield! dir.GetDirectories()
|> Seq.map (getFiles filter)
|> Seq.concat
}
let limitSize limit (file:System.IO.FileInfo) = file.Length < limit
[<AutoOpen>]
module Parsing =
type Result<'T> =
| Success of 'T
| Fail of string
let bind f = function
| Success x -> f x
| Fail s -> Fail s
let (>>=) r f = bind f r
let validateArgLenth (argv:string[]) =
if argv.Length <> 2 then Fail (sprintf "Expected 2 arguments got %i" argv.Length)
else Success argv
let parseArg1 (argv:string[]) =
let (parseOk, value) = System.Int64.TryParse(argv.[0])
if parseOk then Success value
else Fail (sprintf "Invalid limit argument %s" argv.[0])
let parseArg2 (argv:string[]) value =
let dir = argv.[1] |> directory
if dir.Exists then Success (value, dir)
else Fail (sprintf "Invalid directory %s" argv.[1])
let parse argv =
argv |> validateArgLenth >>= parseArg1 >>= (parseArg2 argv)
[<AutoOpen>]
module IOExtensions =
// Stuff copied with pride from: http://www.fssnip.net/1k
open System.IO
type AsyncSeq<'T> = Async<AsyncSeqInner<'T>>
and AsyncSeqInner<'T> =
| Ended
| Item of 'T * AsyncSeq<'T>
let readInBlocks fn size = async {
let stream = File.OpenRead(fn)
let buffer = Array.zeroCreate size
let rec nextBlock() = async {
let! count = stream.AsyncRead(buffer, 0, size)
if count = 0 then return Ended
else
let res =
if count = size then buffer
else buffer |> Seq.take count |> Array.ofSeq
return Item(res, nextBlock()) }
return! nextBlock() }
let rec compareAsyncSeqs seq1 seq2 = async {
let! item1 = seq1
let! item2 = seq2
match item1, item2 with
| Item(b1, ns1), Item(b2, ns2) when b1 <> b2 -> return false
| Item(b1, ns1), Item(b2, ns2) -> return! compareAsyncSeqs ns1 ns2
| Ended, Ended -> return true
| _ -> return failwith "Size doesn't match" }
[<AutoOpen>]
module Printing =
// Colored printfn
let cprintfn c fmt =
Printf.kprintf
(fun s ->
let old = System.Console.ForegroundColor
try
System.Console.ForegroundColor <- c;
System.Console.WriteLine s
finally
System.Console.ForegroundColor <- old)
fmt
let printFileCheck fileCheck =
cprintfn ConsoleColor.Yellow "\tFile: %s" fileCheck.FileName
let print group =
let head = group |> Seq.head
cprintfn ConsoleColor.Green "File %s has the same content as" head.FileName
group |> Seq.skip 1 |> Seq.iter printFileCheck
let printResult (startTime:DateTime) result =
let resultList = result |> Seq.map (fun x -> x |> Seq.toList) |> Seq.toList
let timeFormat (dateTime:DateTime) = dateTime.ToString("HH:mm:ss.fffff")
let stopTime = DateTime.Now
let duration = stopTime - startTime
printfn "============== RESULT =============="
resultList |> Seq.iter print
printfn "============== END RESULT =============="
printfn "============== SUMMARY =============="
cprintfn ConsoleColor.Green "Number of groups with duplicates: %i" (resultList |> List.length)
cprintfn ConsoleColor.Green "Total number of duplicates: %i" (resultList |> List.sumBy List.length)
cprintfn ConsoleColor.Green "Start time: %s" (startTime |> timeFormat)
cprintfn ConsoleColor.Green "Stop time: %s" (stopTime |> timeFormat)
cprintfn ConsoleColor.Green "Duration: %f s" duration.TotalSeconds
printfn "============== END SUMMARY =============="
[<AutoOpen>]
module DuplicateSearch =
let findDuplicatesInGroup group =
let compareFileChecks fc1 fc2 =
let s1 = readInBlocks fc1.FileName 1000
let s2 = readInBlocks fc2.FileName 1000
printfn "Comparing '%s' with '%s" fc1.FileName fc2.FileName
compareAsyncSeqs s1 s2 |> Async.RunSynchronously
let rec find remaining res =
match remaining with
| [] -> res
| _ ->
let candidate = remaining |> Seq.head
let tailSet = remaining |> Seq.skip 1 |> Set.ofSeq
let dups = tailSet |> Seq.filter (compareFileChecks candidate) |> Seq.toList
match dups with
| [] -> find (remaining |> List.tail) res
| xs ->
let remaining' = tailSet - (dups |> Set.ofList) |> Seq.toList
find remaining' ((candidate::dups)::res)
find (group |> Seq.toList) []
let groupFilesBySize = Seq.groupBy (fun s -> s.Size)
let removeKeyFromGroups = Seq.map (fun (k, g) -> g)
let findDuplicatesInGroups = Seq.map findDuplicatesInGroup
let joinDuplicateGroups = Seq.concat
let findDuplicates sizeLimit dir =
dir
|> getFiles (limitSize sizeLimit)
|> groupFilesBySize
|> removeKeyFromGroups
|> findDuplicatesInGroups
|> joinDuplicateGroups
[<EntryPoint>]
let main argv =
let parseResult = parse argv
let startTime = System.DateTime.Now
match parseResult with
| Fail str -> printfn "%s" str
| Success (limit, dir) -> findDuplicates limit dir |> printResult startTime
0 // return an integer exit code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment