Created
November 27, 2014 11:32
-
-
Save icedraco/126c6f947fc02db063dc to your computer and use it in GitHub Desktop.
A small Python script that uses MD5 hashing to find duplicate files in different paths/collections. Originally made to weed through porn imagery...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##
# Path Compare Utility (20120729)
#
# A script example to help sort through different porn collections and weed out
# duplicate files between them!
#
# Syntax: python path-compare.py <target_path> <src_path> [src_path] [src_path] ...
#
# NOTE: There are some functions here that aren't used in main(). I left them
#       there to be used through ipython or some other interactive shell.
#
# NOTE: DO NOT USE WITH LARGE FILES! The MD5 hashing involved reads each file
#       into RAM before processing it rather than portions of it! I'm lazy. :P
#
# Author:  IceDragon <icedragon@quickfox.org>
# Contact: http://www.icerealm.org/contact
#
import hashlib
import os.path
from sys import argv

try:
    from md5 import md5  # legacy module, only present on Python 2
except ImportError:
    from hashlib import md5
def processPath(container, dirname, filenames):
    """Hash every regular file in one directory and record it by checksum.

    The (container, dirname, filenames) signature matches the visitor
    callback convention of the directory walkers used by this script.

    container -- dict mapping MD5 hex digest -> list of file paths; it is
                 updated in place.
    dirname   -- directory currently being scanned.
    filenames -- names of the entries inside dirname; anything that is not
                 a regular file (e.g. a sub-directory) is ignored.

    Errors raised while opening or reading a file are not caught here and
    propagate to the caller.
    """
    # Read in fixed-size pieces so large files never have to fit in RAM.
    chunk_size = 1024 * 1024

    print(" Scanning path: " + dirname)
    for name in filenames:
        full_path = os.path.join(dirname, name)
        if not os.path.isfile(full_path):
            continue

        digest = hashlib.md5()
        # Binary mode: the checksum must be taken over the raw bytes, with no
        # newline translation or text decoding; "with" closes the handle.
        with open(full_path, "rb") as stream:
            for chunk in iter(lambda: stream.read(chunk_size), b""):
                digest.update(chunk)

        container.setdefault(digest.hexdigest(), []).append(full_path)
def findLocalDuplicates(md5map):
    """Return the entries of md5map whose checksum is shared by several files.

    md5map -- dict mapping checksum -> list of paths.

    The result is a new dict holding only the checksums that have more than
    one path attached; the path lists themselves are the same list objects
    as in md5map (they are not copied).
    """
    return {
        checksum: paths
        for checksum, paths in md5map.items()
        if len(paths) > 1
    }
def findForeignDuplicates(target_md5, source_md5):
    """Return the source entries whose checksum also occurs in the target.

    target_md5 -- container of checksums found in the target (only
                  membership is tested).
    source_md5 -- dict mapping checksum -> list of source paths.

    The result is a new dict mapping each shared checksum to the path list
    taken from source_md5 (the same list object, not a copy).
    """
    return {
        checksum: paths
        for checksum, paths in source_md5.items()
        if checksum in target_md5
    }
def comparePaths(target, source_list, handler):
    """Find files under the source paths that duplicate files under target.

    target      -- directory holding the reference collection.
    source_list -- a single source directory (str) or an iterable of them.
    handler     -- callable invoked as handler(path) for every source file
                   whose content also exists under target.

    Raises Exception("Path not found: ...") if target or any source path is
    not an existing directory; this is checked before any scanning starts.
    """
    # Stabilize source_list from all potential valid input types
    if isinstance(source_list, str):
        source_list = [source_list]
    elif not isinstance(source_list, list):
        source_list = list(source_list)

    # Check source paths so we don't crash on missing stuff mid-way
    print("* Checking source and target paths...")
    for path in source_list + [target]:
        if not os.path.isdir(path):
            raise Exception("Path not found: %s" % path)

    # Good to go - start grabbing checksums from files in the target path.
    # os.walk is used because os.path.walk no longer exists on Python 3.
    target_md5 = {}
    print("* Scanning files from target path...")
    for dirname, _subdirs, filenames in os.walk(target):
        processPath(target_md5, dirname, filenames)

    # Now go through the sources
    source_md5 = {}
    for src_path in source_list:
        print("* Scanning files from source path: %s..." % src_path)
        for dirname, _subdirs, filenames in os.walk(src_path):
            processPath(source_md5, dirname, filenames)

    # Weed out the duplicates now.
    print("* Processing foreign duplicates...")
    duplicates = findForeignDuplicates(target_md5, source_md5)

    # And do something with them...
    for checksum, path_list in duplicates.items():
        target_files = set(os.path.realpath(p) for p in target_md5[checksum])
        for path in path_list:
            # When source and target trees overlap, a file can "match" only
            # itself. That is not a duplicate, and passing it to a
            # destructive handler would remove the sole copy - skip it.
            if target_files == set([os.path.realpath(path)]):
                continue
            handler(path)

    print("* Done")
def handler_print(path, checksum=None):
    """Duplicate handler that only reports: print the path, return True.

    checksum is accepted but not used.
    """
    print(path)
    return True
def handler_delete(path, checksum=None):
    """Duplicate handler that deletes the file at path.

    Prints "rm <path>" and returns True when the file was removed; prints an
    error line and returns False when removal raised an exception.
    checksum is accepted but not used.
    """
    try:
        os.remove(path)
    except Exception as e:
        print("ERROR: Cannot delete '%s': %s" % (path, e))
        removed = False
    else:
        print("rm " + path)
        removed = True
    return removed
def main(argv):
    """Command-line entry point; returns the process exit status.

    argv -- argument vector: program name, target path, then one or more
            source paths.

    Returns 0 after printing the syntax line when fewer than two paths were
    given, 0 when the comparison completed, and 1 when comparePaths raised
    (the error is printed with a "FATAL:" prefix).
    """
    if len(argv) < 3:
        print("Syntax: %s <target_path> <source_path> [source_path] [source_path] ..." % argv[0])
        return 0

    target, source = argv[1], argv[2:]

    exit_status = 0
    try:
        comparePaths(target, source, handler_print)
    except Exception as e:
        print("FATAL: %s" % e)
        exit_status = 1
    return exit_status
### INIT ###
# Run main() only when executed as a script, and hand its return value to
# the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main(argv)
    raise SystemExit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment