Skip to content

Instantly share code, notes, and snippets.

@noblethrasher
Created November 16, 2012 21:57
Show Gist options
  • Save noblethrasher/4091269 to your computer and use it in GitHub Desktop.
Save noblethrasher/4091269 to your computer and use it in GitHub Desktop.
Finds duplicate files on a Windows system and generates a simple report.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Security.Cryptography;
using System.Diagnostics;
namespace DuplicateFileFinder
{
/// <summary>
/// A byte count that knows how to print itself in a human-friendly unit.
/// Obtain instances through <c>GetAppropriateUnit</c>; the returned object's
/// <c>ToString</c> yields text such as "512 bytes", "3 KB", "7 MB" or "2 GB".
/// </summary>
abstract class DataUnit
{
    // Binary (power-of-1024) thresholds, expressed in bytes.
    const long KILO = 1024L;
    const long MEGA = 1024L * KILO;
    const long GIGA = 1024L * MEGA;

    // The raw size in bytes, shared with the unit-specific subclasses.
    protected readonly long length;

    /// <summary>Stores the raw byte count.</summary>
    /// <param name="length">Size in bytes.</param>
    public DataUnit(long length)
    {
        this.length = length;
    }

    /// <summary>
    /// Picks the largest unit (byte, KB, MB, GB) whose threshold the given
    /// byte count reaches.
    /// </summary>
    /// <param name="length">Size in bytes.</param>
    /// <returns>A unit object whose <c>ToString</c> renders the size.</returns>
    public static DataUnit GetAppropriateUnit(long length)
    {
        // Checks ascend, so the first threshold the size falls under decides.
        if (length < KILO)
            return new Byte (length);

        if (length < MEGA)
            return new Kilobyte (length);

        if (length < GIGA)
            return new Megabyte (length);

        return new Gigabyte (length);
    }

    /// <summary>Renders the size in whole gigabytes (integer division).</summary>
    sealed class Gigabyte : DataUnit
    {
        public Gigabyte(long length) : base (length) { }

        public override string ToString()
        {
            var whole = length / GIGA;
            return whole + " GB";
        }
    }

    /// <summary>Renders the size in whole megabytes (integer division).</summary>
    sealed class Megabyte : DataUnit
    {
        public Megabyte(long length) : base (length) { }

        public override string ToString()
        {
            var whole = length / MEGA;
            return whole + " MB";
        }
    }

    /// <summary>Renders the size in whole kilobytes (integer division).</summary>
    sealed class Kilobyte : DataUnit
    {
        public Kilobyte(long length) : base (length) { }

        public override string ToString()
        {
            var whole = length / KILO;
            return whole + " KB";
        }
    }

    /// <summary>Renders the size in bytes, using the singular only for exactly 1.</summary>
    sealed class Byte : DataUnit
    {
        public Byte(long length) : base (length) { }

        public override string ToString()
        {
            if (length == 1)
                return length + " byte";

            return length + " bytes";
        }
    }
}
/// <summary>
/// A duration in seconds that prints itself in the largest fitting unit
/// (second, minute, hour, day, week or 30-day month), e.g. "1 hour" or "3 days".
/// Obtain instances through <c>GetApproriateUnit</c>.
/// </summary>
abstract class TimeUnit
{
    // Unit sizes expressed in seconds; a month is fixed at 30 days.
    protected const int MINUTE = 60;
    protected const int HOUR = MINUTE * 60;
    protected const int DAY = HOUR * 24;
    protected const int WEEK = DAY * 7;
    protected const int MONTH = DAY * 30;

    // The raw duration in seconds.
    protected readonly int length;

    /// <summary>Stores the raw duration.</summary>
    /// <param name="length">Duration in seconds.</param>
    public TimeUnit(int length)
    {
        this.length = length;
    }

    /// <summary>
    /// Picks the largest unit whose size the given number of seconds reaches.
    /// </summary>
    /// <param name="length">Duration in seconds.</param>
    /// <returns>A unit object whose <c>ToString</c> renders the duration.</returns>
    public static TimeUnit GetApproriateUnit(int length)
    {
        // Checks ascend, so the first unit the duration falls short of decides.
        if (length < MINUTE)
            return new Denominated (length, 1, "second");

        if (length < HOUR)
            return new Denominated (length, MINUTE, "minute");

        if (length < DAY)
            return new Denominated (length, HOUR, "hour");

        if (length < WEEK)
            return new Denominated (length, DAY, "day");

        if (length < MONTH)
            return new Denominated (length, WEEK, "week");

        return new Denominated (length, MONTH, "month");
    }

    /// <summary>Renders the duration; implemented by the concrete unit.</summary>
    public abstract override string ToString();

    /// <summary>
    /// Formats the duration as a whole number of the given unit followed by
    /// the unit name, pluralised with "s" unless the number is exactly 1.
    /// </summary>
    /// <param name="denomination">Size of the unit in seconds.</param>
    /// <param name="name">Singular unit name.</param>
    protected string Report(int denomination, string name)
    {
        var count = length / denomination;
        var plural = (count == 1) ? "" : "s";
        return count + " " + name + plural;
    }

    /// <summary>
    /// The one concrete unit: it remembers which denomination and name were
    /// chosen for it and hands both to <c>Report</c>.
    /// </summary>
    sealed class Denominated : TimeUnit
    {
        readonly int denomination;
        readonly string name;

        public Denominated(int length, int denomination, string name) : base (length)
        {
            this.denomination = denomination;
            this.name = name;
        }

        public override string ToString()
        {
            return Report (denomination, name);
        }
    }
}
/// <summary>
/// Console front end: repeatedly asks for a directory and a file pattern,
/// hashes every matching file beneath that directory with MD5, and appends a
/// report of files sharing a hash to "Duplicate File Report.txt" on the desktop.
/// </summary>
class Program
{
    // One MD5 instance is reused for every hash computed by this class.
    static readonly MD5 md5 = MD5.Create ();

    // Groups found by the current scan, keyed by the Base64 text of the MD5 digest.
    static Dictionary<string, FileGroup> files = new Dictionary<string, FileGroup> ();

    // 80 dashes, written under the report heading.
    static readonly string border;

    /// <summary>
    /// All files found so far that share one content hash. Two groups are
    /// considered equal when their hashes are equal.
    /// </summary>
    class FileGroup : IEquatable<FileGroup>
    {
        public readonly string Hash;
        public readonly List<FileInfo> files = new List<FileInfo> ();

        // Human-readable "path message" entries for everything that could not be read.
        public readonly static List<string> problem_files = new List<string> ();

        /// <summary>
        /// Hashes the file and starts a group with it. If the file cannot be
        /// read, the failure is recorded in <c>problem_files</c> and the group
        /// is left empty with a null <c>Hash</c>.
        /// </summary>
        public FileGroup(FileInfo info)
        {
            try
            {
                // Hashing only reads; asking for write access as well would
                // needlessly fail on read-only files.
                using (var fs = new FileStream (info.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                    Hash = Convert.ToBase64String (md5.ComputeHash (fs));
                files.Add (info);
            }
            catch (Exception ex)
            {
                problem_files.Add (info.FullName + " " + ex.Message);
            }
        }

        /// <summary>Starts a group from an already-computed hash and its first file.</summary>
        public FileGroup(string hash, FileInfo info)
        {
            this.Hash = hash;
            files.Add (info);
        }

        // Backs the implicit string conversion: a group with a hash but no files.
        private FileGroup(string hash)
        {
            this.Hash = hash;
        }

        /// <summary>Adds another file with the same hash to the group.</summary>
        public void Add(FileInfo info)
        {
            files.Add (info);
        }

        /// <summary>
        /// Bytes occupied by the redundant copies: every file after the first,
        /// each the size of the first. Zero when the group has fewer than two files.
        /// </summary>
        public long ExcessSize
        {
            get
            {
                // Guards the files[0] access for groups that ended up empty.
                if (files.Count < 2)
                    return 0;

                return (files.Count - 1) * files[0].Length;
            }
        }

        /// <summary>Hash code derived from <c>Hash</c>, matching <c>Equals</c>.</summary>
        public override int GetHashCode()
        {
            // Must depend on this group's own hash; the shared md5 object's
            // hash code would be the same for every group.
            return Hash == null ? 0 : Hash.GetHashCode ();
        }

        public override bool Equals(object obj)
        {
            return Equals (obj as FileGroup);
        }

        /// <summary>True when <paramref name="other"/> is non-null and has the same hash.</summary>
        public bool Equals(FileGroup other)
        {
            return !ReferenceEquals (other, null) && other.Hash == this.Hash;
        }

        /// <summary>Lets a bare hash string stand in for a (file-less) group.</summary>
        public static implicit operator FileGroup(string s)
        {
            return new FileGroup (s);
        }
    }

    static Program()
    {
        border = new string ('-', 80);
    }

    /// <summary>
    /// Interactive loop: scan a directory, print a summary, write the report,
    /// then ask whether to scan another directory.
    /// </summary>
    static void Main(string[] args)
    {
        var stopwatch = new Stopwatch ();
        string answer;
        do
        {
            stopwatch.Reset ();
            ClearScreen ();

            // Every scan starts from scratch; otherwise groups from an earlier
            // directory would be counted and reported again.
            files.Clear ();
            FileGroup.problem_files.Clear ();

            Console.WriteLine ("Directory:");
            var root = Console.ReadLine ();
            if (Directory.Exists (root))
            {
                // Ask for the pattern before starting the clock so the
                // reported time covers the scan, not the user's typing.
                var patterns = ReadPatterns ();
                stopwatch.Start ();
                GetFiles (root, patterns);
                stopwatch.Stop ();
                var time_taken = TimeUnit.GetApproriateUnit ((int)(stopwatch.ElapsedMilliseconds / 1000)).ToString ();

                // files holds one entry per distinct hash, so the number of
                // files hashed is the sum of the group sizes.
                var scanned = 0;
                var dupes = new List<FileGroup> ();
                foreach (var grp in files.Values)
                {
                    scanned += grp.files.Count;
                    if (grp.files.Count > 1)
                        dupes.Add (grp);
                }

                Console.WriteLine ();
                Console.WriteLine ("Scanned " + scanned + " files in " + time_taken);
                Console.WriteLine (dupes.Count + " files with duplicates");

                if (dupes.Count > 0)
                    WriteReport (root, dupes);

                var problems = FileGroup.problem_files.Count;
                if (problems > 0)
                    Console.WriteLine ("There was a problem reading or opening " + problems + " " + (problems > 1 ? "files or directories" : "file or directory"));
            }
            else
            {
                Console.WriteLine ("Cannot find that directory (it might be misspelled).");
                Console.WriteLine ();
            }
            Console.WriteLine ("Search other directories (y/n)?");
            answer = Console.ReadLine ();

            // ReadLine returns null when input has ended; treat that as "no".
        } while (answer != null && answer.StartsWith ("y", StringComparison.OrdinalIgnoreCase));
    }

    // Clears the console, tolerating the case where there is no console to clear.
    static void ClearScreen()
    {
        try
        {
            Console.Clear ();
        }
        catch (IOException)
        {
            // Output is not an interactive console; nothing to clear.
        }
    }

    // Prompts for ';'-separated search patterns; blank or missing input means "*".
    static string[] ReadPatterns()
    {
        Console.WriteLine ("Pattern:");
        var line = Console.ReadLine () ?? "";
        var patterns = line.Split (new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
        return patterns.Length > 0 ? patterns : new[] { "*" };
    }

    // Appends the duplicate listing and the total wasted space to the desktop report.
    static void WriteReport(string root, List<FileGroup> dupes)
    {
        var desktop_path = Environment.GetFolderPath (Environment.SpecialFolder.Desktop);
        using (var sw = new StreamWriter (Path.Combine (desktop_path, "Duplicate File Report.txt"), true))
        {
            // Parenthesised so the whole heading is upper-cased, not just ")".
            sw.WriteLine (("Duplicate files in " + root + " (" + DateTime.Now.ToString () + ")").ToUpper ());
            sw.WriteLine (border);
            sw.WriteLine ();

            long total_excess = 0;
            foreach (var dupe in dupes)
            {
                total_excess += dupe.ExcessSize;

                // First file flush left, its duplicates indented beneath it.
                sw.WriteLine (dupe.files[0].FullName);
                for (var i = 1; i < dupe.files.Count; i++)
                    sw.WriteLine ("\t" + dupe.files[i].FullName);
                sw.WriteLine ();
            }

            sw.WriteLine ();
            sw.WriteLine ("Excess: " + DataUnit.GetAppropriateUnit (total_excess));
            sw.WriteLine ();
        }
    }

    /// <summary>
    /// Prompts for the search pattern(s) and then scans <paramref name="root"/>
    /// and everything beneath it, filling <c>files</c>.
    /// </summary>
    /// <param name="root">Directory at which the scan starts.</param>
    static void GetFiles(string root)
    {
        GetFiles (root, ReadPatterns ());
    }

    /// <summary>
    /// Walks the directory tree iteratively and hashes every file matching any
    /// of the patterns. Directories that cannot be listed are recorded in
    /// <c>FileGroup.problem_files</c> and skipped.
    /// </summary>
    /// <param name="root">Directory at which the scan starts.</param>
    /// <param name="patterns">Search patterns passed to <c>DirectoryInfo.GetFiles</c>.</param>
    static void GetFiles(string root, string[] patterns)
    {
        var stack = new Stack<DirectoryInfo> ();
        stack.Push (new DirectoryInfo (root));
        var default_color = Console.ForegroundColor;

        while (stack.Count > 0)
        {
            var dir = stack.Pop ();
            try
            {
                // A file matching several patterns must be hashed only once,
                // or it would be reported as a duplicate of itself.
                var seen = new HashSet<string> (StringComparer.Ordinal);
                foreach (var pattern in patterns)
                {
                    foreach (var file in dir.GetFiles (pattern))
                    {
                        if (seen.Add (file.FullName))
                            HashFile (file, default_color);
                    }
                }

                // Pushed in reverse so they are popped in listing order.
                foreach (var sub in Utils.Reverse (dir.GetDirectories ()))
                    stack.Push (sub);
            }
            catch (Exception ex)
            {
                // Keep scanning, but make the skipped directory visible.
                FileGroup.problem_files.Add (dir.FullName + " " + ex.Message);
            }
        }
    }

    // Hashes one file, files it under its hash, and echoes it (red when its
    // hash was already known). Unreadable files are recorded and skipped.
    static void HashFile(FileInfo file, ConsoleColor default_color)
    {
        try
        {
            string hash;
            using (var sr = file.Open (FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                hash = Convert.ToBase64String (md5.ComputeHash (sr));

            FileGroup existing;
            if (files.TryGetValue (hash, out existing))
            {
                existing.Add (file);
                Console.ForegroundColor = ConsoleColor.Red;
            }
            else
            {
                files.Add (hash, new FileGroup (hash, file));
            }
            Console.WriteLine (file.FullName + " " + hash);
        }
        catch (Exception ex1)
        {
            FileGroup.problem_files.Add (file.FullName + " " + ex1.Message);
        }
        finally
        {
            // Never leave the console red, whatever happened above.
            Console.ForegroundColor = default_color;
        }
    }
}
/// <summary>Small sequence helpers.</summary>
public static class Utils
{
    /// <summary>
    /// Returns the elements of <paramref name="xs"/> in reverse order without
    /// modifying <paramref name="xs"/>. Reference-type arrays and
    /// <c>List&lt;T&gt;</c> are copied immediately; any other sequence is
    /// read when the result is enumerated, and the result may be enumerated
    /// more than once.
    /// </summary>
    /// <param name="xs">The sequence to reverse.</param>
    /// <returns>A new sequence holding the same elements, last first.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="xs"/> is null.</exception>
    public static IEnumerable<T> Reverse<T>(IEnumerable<T> xs)
    {
        if (xs == null)
            throw new ArgumentNullException ("xs");

        // Only reference-type arrays take this shortcut; value-type arrays
        // fall through to the general path below, as they always have.
        if (xs is T[] && (!typeof (T).IsValueType))
        {
            var array = (T[]) xs;
            var copy = new T[array.Length];
            Array.Copy (array, copy, array.Length);
            Array.Reverse (copy);
            return copy;
        }

        var list = xs as List<T>;
        if (list != null)
        {
            var copy = new List<T> (list);
            copy.Reverse ();
            return copy;
        }

        return ReverseIterator (xs);
    }

    // Buffers the source on a stack and yields it back last-in-first-out.
    // It takes the sequence rather than an enumerator, so each enumeration
    // of the result reads the source afresh and the source enumerator is
    // disposed by the Stack constructor.
    private static IEnumerable<T> ReverseIterator<T>(IEnumerable<T> source)
    {
        var stack = new Stack<T> (source);
        foreach (var item in stack)
            yield return item;
    }
}
}
@noblethrasher
Copy link
Author

Just a simple console app for finding duplicate files (based on the MD5 signature).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment