Skip to content

Instantly share code, notes, and snippets.

@bjartwolf
Created August 24, 2015 08:33
Show Gist options
  • Save bjartwolf/d4d83eda3c2f18bf3ce7 to your computer and use it in GitHub Desktop.
Save bjartwolf/d4d83eda3c2f18bf3ce7 to your computer and use it in GitHub Desktop.
open System
open System.IO
/// A file found on disk together with its size.
type FileWithLength =
    {
        /// Path of the file, as returned by the directory listing.
        Path: string
        /// Size of the file in bytes.
        Length: int64
    }
/// A candidate file paired with the stream opened on it and the buffer that
/// receives the chunk most recently read from that stream.
type FileWithLengthAndStreamBuffer =
    {
        /// The file (path and size) this entry belongs to.
        FileWithLength: FileWithLength
        /// Stream opened on the file; must be disposed by whoever opened it.
        Str: FileStream
        /// Buffer that chunks read from Str are stored in.
        Buf: byte[]
    }
/// Lists the files directly inside `path` together with their sizes.
/// Files with long paths are ignored, they are annoying on Windows.
/// A directory that cannot be listed is reported and treated as empty; a file
/// whose size cannot be read is reported and skipped.
let listfiles path =
    // Paths of this many characters or more are left out.
    let maxPathLength = 259
    let files =
        try
            Directory.GetFiles(path)
        with _ ->
            printfn "Access denied to %A" path
            Array.empty
    // Query each file on its own: a file that was deleted or became unreadable
    // after the listing must not abort the scan of everything else.
    let describe (file: string) =
        try
            Some { Path = file; Length = FileInfo(file).Length }
        with
        | :? IOException
        | :? UnauthorizedAccessException ->
            printfn "Could not read size of %A" file
            None
    files
    |> Array.filter (fun f -> f.Length < maxPathLength)
    |> Array.choose describe
/// Keeps only the files whose size is at least `minSize` bytes.
let filterBySize minSize (files: FileWithLength []) =
    let isBigEnough (file: FileWithLength) = not (file.Length < minSize)
    Array.filter isBigEnough files
/// Lazily walks `path` and every directory below it, producing each file of
/// at least `minSize` bytes. Directories we can't read are reported and ignored.
let rec getAllFiles path minSize =
    // Subdirectories of `dir`, or none at all when they cannot be listed.
    let childDirectories (dir: string) =
        try
            Directory.GetDirectories(dir)
        with _ ->
            printfn "Access denied to %A" dir
            Array.empty
    seq {
        // The directory's own files come first, then each subtree in turn.
        yield! filterBySize minSize (listfiles path)
        for child in childDirectories path do
            yield! getAllFiles child minSize
    }
// Size in bytes of the buffer allocated for each open file.
// Should be small, presumably to keep the buffers off the .NET large object
// heap (LOH) - TODO confirm.
let bufferlength = 4096
/// Reads the next chunk of every open file into that file's own buffer.
/// `files` is a collection of groups, each group being a collection of open
/// files. Whether the end of the files was reached is checked by the caller.
let readToBuffer files =
    // A single Read call may deliver fewer bytes than requested even before the
    // end of the file, so keep reading until the buffer is full or the stream
    // has nothing more to give.
    let fill (f: FileWithLengthAndStreamBuffer) =
        let mutable filled = 0
        let mutable atEnd = false
        while not atEnd && filled < f.Buf.Length do
            let count = f.Str.Read(f.Buf, filled, f.Buf.Length - filled)
            if count = 0 then atEnd <- true else filled <- filled + count
        // Zero whatever a final, shorter chunk did not overwrite, so bytes left
        // over from the previous chunk never take part in the comparison.
        System.Array.Clear(f.Buf, filled, f.Buf.Length - filled)
    for s in files do
        for f in s do
            fill f
/// Repeatedly reads one chunk from every open file and splits each group of
/// files by the content of that chunk. Files left without a partner are
/// dropped. Returns the groups (each with at least two files) that stayed
/// together until the end of the files, or an empty list when none did.
let rec splitOpenFilesOfEqualLengthByContentStreaming files =
    readToBuffer files
    // Split every existing group by the chunk just read; only groups that still
    // hold more than one file can contain duplicates.
    let groupedByBufferContent =
        files
        |> List.collect (fun s ->
            s
            |> Seq.groupBy (fun f -> f.Buf)
            |> Seq.map (snd >> Seq.toList)
            |> Seq.filter (fun group -> List.length group > 1)
            |> Seq.toList)
    match groupedByBufferContent with
    | [] -> []
    | firstGroup :: _ ->
        // The files are expected to have the same length, so a single one is
        // used to decide whether the end has been reached.
        let oneFile = List.head firstGroup
        // ">=" rather than "=": the streams are opened with shared write access,
        // so a file truncated while being compared can end up with its position
        // beyond its length; an equality test would then never succeed.
        if oneFile.Str.Position >= oneFile.Str.Length then
            groupedByBufferContent
        else
            splitOpenFilesOfEqualLengthByContentStreaming groupedByBufferContent
/// Splits files of equal length into groups of files with identical content
/// and returns the groups that have more than one member.
/// Ignores files we can't open for read because they are locked or something.
let splitFilesOfEqualLengthByContentStreaming (files:FileWithLength list):FileWithLength list list =
    // Best effort: a file that cannot be opened simply does not take part.
    let tryOpen (f: FileWithLength) =
        try
            Some
                { FileWithLength = f
                  Str = new FileStream(f.Path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
                  Buf = Array.zeroCreate<byte> bufferlength }
        with _ -> None
    // All candidates start out together in one single group.
    let openStreams = [ files |> List.choose tryOpen ]
    try
        splitOpenFilesOfEqualLengthByContentStreaming openStreams
        |> List.map (List.map (fun f -> f.FileWithLength))
        |> List.filter (fun group -> List.length group > 1)
    finally
        // Release every handle, also when reading fails part-way through.
        openStreams |> List.iter (List.iter (fun f -> f.Str.Dispose()))
/// Groups every file under `path` of at least `minSize` bytes by its size and
/// produces the groups that hold more than one file.
let findFilesOfSameSize path minSize =
    let sizeOf (file: FileWithLength) = file.Length
    getAllFiles path minSize
    |> Seq.groupBy sizeOf
    |> Seq.choose (fun (_, sameSize) ->
        if Seq.length sameSize > 1 then Some sameSize else None)
/// Produces every group of files under `path` (of at least `minSize` bytes)
/// that have the same size and the same content.
let findEqualFiles path minSize =
    findFilesOfSameSize path minSize
    |> Seq.collect (fun sameSize ->
        sameSize |> Seq.toList |> splitFilesOfEqualLengthByContentStreaming)
/// Prints every group of equal files: a blank separator, the length taken
/// from the first file of the group, a ruler, then one path per line.
let prettyPrintStream (input: FileWithLength list seq) =
    let printGroup (group: FileWithLength list) =
        let sharedLength = (Seq.head group).Length
        printfn "\r\n"
        printfn "Fillength: %A" sharedLength
        printfn "******************"
        group |> List.iter (fun file -> printfn "%A" file.Path)
    Seq.iter printGroup input
/// Entry point. Expects two arguments: the minimum file size in bytes and the
/// directory to scan. Prints every group of identical files found below that
/// directory. Returns 0 on success and 1 when anything fails.
[<EntryPoint>]
let main (argv: string[]) =
    try
        if not (argv.Length = 2) then failwith "First argument is min filesize in bytes and second is path"
        let couldParse, minBytes = Int64.TryParse(argv.[0])
        if (not couldParse) then failwith "First argument is min filesize to scan"
        let dir = new IO.DirectoryInfo(argv.[1])
        if not (dir.Exists) then failwith (sprintf "Directory %s does not exist" dir.FullName)
        prettyPrintStream (findEqualFiles dir.FullName minBytes)
        Console.WriteLine("DONE")
        // Console.ReadKey throws when standard input is redirected (for example
        // when run from a script), which would turn a successful run into a
        // failure; only wait for a key press when a console is attached.
        if not Console.IsInputRedirected then
            Console.ReadKey() |> ignore
        0 // return an integer exit code
    with ex ->
        printfn "%A" ex
        1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment