Skip to content

Instantly share code, notes, and snippets.

@danielcbaldwin
Created February 17, 2013 06:18
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielcbaldwin/4970429 to your computer and use it in GitHub Desktop.
Save danielcbaldwin/4970429 to your computer and use it in GitHub Desktop.
Python script to remove duplicate images / files.
#! /usr/bin/python
# Got this from: http://code.activestate.com/recipes/362459/
import hashlib
import md5
import os
import stat
import sys
# Maps a file size in bytes to the list of paths found with exactly that size.
filesBySize = {}


def walker(arg, dirname, fnames):
    """Record the regular files of one directory in ``filesBySize``.

    Has the visitor signature expected by ``os.path.walk``.  Paths are built
    with ``os.path.join`` instead of changing the working directory, so a
    failure part-way through can never leave the process in another
    directory.

    Args:
        arg: Unused; present only because the visitor signature requires it.
        dirname: Directory whose entries are listed in ``fnames``.
        fnames: Names of the entries in ``dirname``.  Modified in place: the
            first ``'Thumbs'`` entry is removed, which both skips it here and
            lets a walker that honours in-place edits avoid descending into
            it.
    """
    try:
        fnames.remove('Thumbs')
    except ValueError:
        pass
    for f in fnames:
        path = os.path.join(dirname, f)
        if not os.path.isfile(path):
            continue
        try:
            size = os.stat(path)[stat.ST_SIZE]
        except OSError:
            # The file vanished or became unreadable after the isfile()
            # check.  Leaving it out is safe: an unrecorded file is never
            # considered for deletion.
            continue
        # Files under 100 bytes are not considered at all.
        if size < 100:
            continue
        filesBySize.setdefault(size, []).append(path)
# Walk every directory named on the command line; walker() files each
# regular file it meets under its size in filesBySize.
for topDir in sys.argv[1:]:
    print('Scanning directory "%s"....' % topDir)
    os.path.walk(topDir, walker, filesBySize)
# Pass 1: cheap pre-filter.  Only files of equal size can be duplicates, so
# each size bucket in filesBySize is split further by the MD5 of the first
# 1024 bytes.  Every group of two or more files that agree on both becomes
# one entry (a list of paths, in scan order) of potentialDupes.
print('Finding potential dupes...')
potentialDupes = []
for k in sorted(filesBySize):
    inFiles = filesBySize[k]
    # A size that occurs only once cannot have a duplicate.
    if len(inFiles) == 1:
        continue
    print('Testing %d files of size %d...' % (len(inFiles), k))
    prefixGroups = {}   # prefix digest -> files sharing it, in scan order
    prefixOrder = []    # digests in first-seen order, for stable output
    for fileName in inFiles:
        if not os.path.isfile(fileName):
            continue
        try:
            # Binary mode: text mode can alter or truncate binary data on
            # some platforms, which would make the digest meaningless.
            with open(fileName, 'rb') as aFile:
                hashValue = hashlib.md5(aFile.read(1024)).digest()
        except (IOError, OSError) as err:
            # An unreadable file is left out; it will never be deleted.
            sys.stderr.write('Skipping unreadable file "%s": %s\n'
                             % (fileName, err))
            continue
        if hashValue not in prefixGroups:
            prefixGroups[hashValue] = []
            prefixOrder.append(hashValue)
        prefixGroups[hashValue].append(fileName)
    for hashValue in prefixOrder:
        if len(prefixGroups[hashValue]) > 1:
            potentialDupes.append(prefixGroups[hashValue])
# The size index is not used past this point; drop it.
del filesBySize
# Report the number of candidate sets (not the number of files in them).
potentialCount = len(potentialDupes)
print('Found %d sets of potential dupes...' % potentialCount)
# Pass 2: confirm candidates by hashing whole files.  Within each candidate
# set the files are grouped by the MD5 of their full content; every group of
# two or more becomes one entry of dupes, whose first element is the copy to
# keep and whose remaining elements are its duplicates.
print('Scanning for real dupes...')
dupes = []
for aSet in potentialDupes:
    digestGroups = {}   # full-content digest -> files with that content
    digestOrder = []    # digests in first-seen order, for stable output
    seenReal = set()    # resolved paths already hashed in this set
    for fileName in aSet:
        # The same file can be listed twice (overlapping directories on the
        # command line, or a symlink).  It must not be reported as its own
        # duplicate, because the duplicate would then be deleted.
        realName = os.path.realpath(fileName)
        if realName in seenReal:
            continue
        seenReal.add(realName)
        print('Scanning file "%s"...' % fileName)
        hasher = hashlib.md5()
        try:
            # Binary mode so the digest covers the exact bytes on disk.
            with open(fileName, 'rb') as aFile:
                for chunk in iter(lambda: aFile.read(4096), b''):
                    hasher.update(chunk)
        except (IOError, OSError) as err:
            # An unreadable file is left out; it will never be deleted.
            sys.stderr.write('Skipping unreadable file "%s": %s\n'
                             % (fileName, err))
            continue
        hashValue = hasher.digest()
        if hashValue not in digestGroups:
            digestGroups[hashValue] = []
            digestOrder.append(hashValue)
        digestGroups[hashValue].append(fileName)
    # One dupes entry per distinct content, so each duplicate is reported
    # against the original it actually matches.
    for hashValue in digestOrder:
        if len(digestGroups[hashValue]) > 1:
            dupes.append(digestGroups[hashValue])
# Final step: for every confirmed set keep the first file and delete the
# rest, printing what is kept and what is removed.
for d in dupes:
    original = d[0]
    print('Original is %s' % original)
    originalReal = os.path.realpath(original)
    for f in d[1:]:
        # Never remove a path that resolves to the original itself (same
        # file listed twice, or a symlink): that would destroy the copy
        # that is supposed to be kept.
        if os.path.realpath(f) == originalReal:
            sys.stderr.write('Skipping %s: same file as the original\n' % f)
            continue
        print('Deleting %s' % f)
        try:
            os.remove(f)
        except OSError as err:
            # Report and carry on so one undeletable file does not abort
            # the clean-up of the remaining duplicates.
            sys.stderr.write('Could not delete %s: %s\n' % (f, err))
    # Blank line between sets.
    print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment