Skip to content

Instantly share code, notes, and snippets.

Created February 20, 2011 23:16
Show Gist options
  • Save anonymous/836413 to your computer and use it in GitHub Desktop.
Save anonymous/836413 to your computer and use it in GitHub Desktop.
Find (and trash) duplicates.
#!/usr/bin/python
import sys
import os
from collections import defaultdict
import hashlib
# Directory tree that is walked to collect candidate files. File.trash()
# works out each file's location relative to this root.
theRoot = '/Volumes/Stuff/Files/'
# Directory that trashed duplicates are moved into; File.trash() joins the
# file's root-relative path onto it.
theTrashFolder = '/Volumes/Stuff/Trash/'
class File(object):
    """A file on disk with lazily computed, increasingly expensive signatures.

    Signature 0 is the file size; the following ones are MD5 digests of a
    growing prefix of the file, and the last one covers the whole file.
    Each signature is computed at most once and then cached.
    """

    # One entry per signature level: None means "file size", a positive
    # number means "MD5 of that many leading bytes", and a negative number
    # means "MD5 of the entire file".
    # (The 64 KiB entry used to read 64 * 1025, an apparent typo.)
    signature_lengths = [None, 1024, 64 * 1024, 256 * 1024, -1]

    # Read size used when hashing a whole file, so large files are never
    # loaded into memory in one piece.
    _chunk_size = 1024 * 1024

    def __init__(self, path):
        """Remember *path*; no I/O happens until a signature is requested."""
        self.path = path
        self._signatures = [None] * len(File.signature_lengths)

    def __repr__(self):
        return self.path

    def _digest(self, length):
        """Return the MD5 hex digest of the first *length* bytes.

        A negative *length* hashes the whole file, reading it in chunks.
        """
        md5 = hashlib.md5()
        # Binary mode: the digest must reflect the raw bytes on every platform.
        with open(self.path, 'rb') as handle:
            if length < 0:
                for chunk in iter(lambda: handle.read(File._chunk_size), b''):
                    md5.update(chunk)
            else:
                md5.update(handle.read(length))
        return md5.hexdigest()

    def signatures(self, count=None):
        """Return the first *count* signatures as a list.

        *count* is clamped to the number of signature levels; omitting it
        (or passing None) returns every signature.
        """
        total = len(File.signature_lengths)
        count = total if count is None else min(count, total)
        for n in range(count):
            # Compare against None explicitly: a size of 0 is a valid,
            # already-computed signature.
            if self._signatures[n] is None:
                length = File.signature_lengths[n]
                if length is None:
                    self._signatures[n] = os.path.getsize(self.path)
                else:
                    self._signatures[n] = self._digest(length)
        return self._signatures[:count]

    def trash(self):
        """Move the file into theTrashFolder, mirroring its path under theRoot.

        Updates self.path to the new location. An existing file in the
        trash is never overwritten; a numeric suffix is appended instead.
        """
        # relpath (rather than slicing by len(theRoot)) keeps working when
        # theRoot is written without a trailing slash.
        relative_path = os.path.relpath(self.path, theRoot)
        trash_path = os.path.join(theTrashFolder, relative_path)

        # os.rename silently replaces an existing destination on POSIX.
        candidate = trash_path
        suffix = 0
        while os.path.lexists(candidate):
            suffix += 1
            candidate = '%s.%d' % (trash_path, suffix)
        trash_path = candidate

        trash_dir = os.path.dirname(trash_path)
        print(trash_path)
        print(trash_dir)
        if not os.path.isdir(trash_dir):
            os.makedirs(trash_dir)
        os.rename(self.path, trash_path)
        self.path = trash_path
def walk(path):
    """Yield a (directory, filename) pair for every file found below *path*."""
    for directory, _subdirs, names in os.walk(path):
        for name in names:
            yield (directory, name)
def find_duplicate_groups(files, levels):
    """Group *files* whose first *levels* signatures are all identical.

    Candidates are narrowed one signature level at a time, so expensive
    signatures are only requested for files that still collide on every
    cheaper one. *files* may be any objects offering ``signatures(count)``.

    Returns a list of lists, each holding two or more matching files.
    """
    candidates = list(files)
    groups = []
    # Start at 1 (signatures(0) is empty and groups nothing) and include
    # *levels* itself, so the final, strongest signature is compared too.
    for depth in range(1, levels + 1):
        buckets = defaultdict(list)
        for f in candidates:
            buckets[tuple(f.signatures(depth))].append(f)
        groups = [group for group in buckets.values() if len(group) > 1]
        candidates = [f for group in groups for f in group]
        if not candidates:
            break
    return groups


def main():
    """Scan theRoot and trash every duplicate except the first in each group."""
    # Dot-files are ignored.
    paths = [os.path.join(p, n) for p, n in walk(theRoot)
             if not n.startswith('.')]
    files = [File(path) for path in paths]
    for group in find_duplicate_groups(files, len(File.signature_lengths)):
        print('*' * 40)
        # Keep the first file of each group and trash the rest.
        for f in group[1:]:
            print('Trashing: %s' % f)
            f.trash()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment