Skip to content

Instantly share code, notes, and snippets.

@icedraco
Created November 27, 2014 11:32
Show Gist options
  • Save icedraco/126c6f947fc02db063dc to your computer and use it in GitHub Desktop.
A small Python script that uses MD5 hashing to find duplicate files in different paths/collections. Originally made to weed through porn imagery...
##
# Path Compare Utility (20120729)
#
# A script example to help sort through different porn collections and weed out
# duplicate files between them!
#
# Syntax: python path-compare.py <target_path> <src_path> [src_path] [src_path] ...
#
# NOTE: There are some functions here that aren't used in main(). I left them
# there to be used through ipython or some other interactive shell.
#
# NOTE: DO NOT USE WITH LARGE FILES! The MD5 hashing involved reads each file
# into RAM before processing it rather than portions of it! I'm lazy. :P
#
# Author: IceDragon <icedragon@quickfox.org>
# Contact: http://www.icerealm.org/contact
#
import hashlib
import os.path
from sys import argv
from md5 import md5  # legacy Python 2 module; kept for backward compatibility
def processPath(container, dirname, filenames):
    """os.path.walk visitor: MD5-hash every regular file in one directory.

    container -- dict mapping an MD5 hex digest to the list of file paths
                 seen so far with that digest (mutated in place).
    dirname   -- directory currently being visited (supplied by os.path.walk).
    filenames -- names of entries inside dirname (supplied by os.path.walk).
    """
    print(" Scanning path: " + dirname)
    for name in filenames:
        full_path = os.path.join(dirname, name)
        if not os.path.isfile(full_path):
            continue  # skip subdirectories, symlink targets, etc.
        # Hash in binary mode and in fixed-size chunks so large files no
        # longer have to fit in RAM, and close the handle deterministically
        # (the original leaked the file object and used text mode).
        digest = hashlib.md5()
        handle = open(full_path, 'rb')
        try:
            chunk = handle.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = handle.read(65536)
        finally:
            handle.close()
        container.setdefault(digest.hexdigest(), []).append(full_path)
def findLocalDuplicates(md5map):
    """Return the subset of md5map whose checksum maps to two or more paths.

    md5map is a dict of MD5 hex digest -> list of file paths; the result has
    the same shape but keeps only the entries that represent duplicates.
    """
    return dict((digest, paths)
                for digest, paths in md5map.items()
                if len(paths) > 1)
def findForeignDuplicates(target_md5, source_md5):
    """Return the entries of source_md5 whose checksum also exists in target_md5.

    Both arguments are dicts of MD5 hex digest -> list of file paths; the
    result keeps source_md5's path lists for every digest shared with the
    target collection.
    """
    return dict((digest, paths)
                for digest, paths in source_md5.items()
                if digest in target_md5)
def comparePaths(target, source_list, handler):
    """Hash every file under target and every source path, then invoke
    handler(path) once per source file whose MD5 also appears under target.

    target      -- directory whose files form the reference collection.
    source_list -- a single path string, or any iterable of path strings.
    handler     -- callable invoked with each duplicate source path.
    Raises Exception if any supplied path is not an existing directory.
    """
    # Accept a bare string or any iterable, normalizing to a plain list.
    if type(source_list) is str:
        source_list = [ source_list ]
    if type(source_list) is not list:
        source_list = list(source_list)

    # Validate every path up front so we don't crash on missing stuff mid-way.
    print("* Checking source and target paths...")
    for candidate in source_list + [target]:
        if not os.path.isdir(candidate):
            raise Exception("Path not found: %s" % candidate)

    # Collect checksums from the target collection first.
    print("* Scanning files from target path...")
    target_md5 = {}
    os.path.walk(target, processPath, target_md5)

    # Then from every source collection, merged into one map.
    source_md5 = {}
    for src in source_list:
        print("* Scanning files from source path: %s..." % src)
        os.path.walk(src, processPath, source_md5)

    # Hand every source-side duplicate to the caller's handler.
    print("* Processing foreign duplicates...")
    for paths in findForeignDuplicates(target_md5, source_md5).values():
        for duplicate in paths:
            handler(duplicate)
    print("* Done")
def handler_print(path, checksum = None):
    """Duplicate handler: report the duplicate path on stdout.

    checksum is accepted for interface parity with other handlers but unused.
    Always returns True (the duplicate was "handled").
    """
    print(path)
    return True
def handler_delete(path, checksum = None):
    """Duplicate handler: delete the file at path.

    checksum is accepted for interface parity with other handlers but unused.
    Returns True on success; on failure prints an error and returns False.
    """
    try:
        os.unlink(path)
    # Narrowed from bare Exception to OSError (what unlink actually raises);
    # the 'except X as e' form works on Python 2.6+ and 3.x, unlike the
    # original Python-2-only 'except Exception,e' spelling.
    except OSError as e:
        print("ERROR: Cannot delete '%s': %s" % (path, e))
        return False
    print("rm " + path)
    return True
def main(argv):
    """Entry point: compare target path (argv[1]) against source paths (argv[2:]).

    Duplicates are only printed (handler_print); swap in handler_delete here
    to actually remove them. Returns a process exit status: 0 on success or
    usage display, 1 on a fatal error during comparison.
    """
    if len(argv) <= 2:
        print("Syntax: %s <target_path> <source_path> [source_path] [source_path] ..." % argv[0])
        return 0
    target = argv[1]
    sources = argv[2:]
    try:
        comparePaths(target, sources, handler_print)
    # Top-level boundary: report anything comparePaths raised and exit
    # non-zero. Uses 'as e', fixing the Python-2-only 'except Exception,e'.
    except Exception as e:
        print("FATAL: %s" % e)
        return 1
    return 0
### INIT ###
# Run only when executed as a script; SystemExit propagates main()'s
# return value as the process exit status. argv comes from the
# 'from sys import argv' at the top of the file.
if __name__ == "__main__":
    raise SystemExit(main(argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment