Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Find duplicate files using file size then sha256 hash. Output json results for later processing.
#!/usr/bin/python
"""
find photos.photolibrary/Masters -type f -name '*.*' | ./finddupes
Tries to find duplicate images in a photo album and writes the results to a
file (dupes.json) for later processing. The output file is JSON of the form:
```
[
["/path/to/img.png", "path/to/img2.png"],
["/path/to/pic.png", "path/to/pic2.png", "path/to/pic4.png"]
]
]
```
Items in the same sub array are expected to be the same.
"""
import atexit
import fileinput
import hashlib
import os
import sys
import json
# Number of times Progress() has been called so far.
PROGRESS_COUNT = 0
# Spinner frames that Progress() cycles through, one per call.
PROGRESS_OPTIONS = ['/', '-', '\\', '|'] * 2
# Incremented by CheckFile() each time a file is matched to an earlier one.
DUPE_COUNT = 0
# Cache filled by GetHash(): file path -> SHA-256 hex digest.
HASHES = dict()
# Filled by CheckFile(): file size in bytes -> paths seen with that size
# that were not matched to an earlier file.
FILE_SIZE_HASH = dict()
# Filled by CheckFile(): truthy value returned by SameFileByHash() -> list of
# paths considered identical. WriteResults() dumps the lists.
DUPES = dict()
def GetHash(x):
    """Return the SHA-256 hex digest of the file at path ``x``.

    The digest is stored in the module-level HASHES dict keyed by path, so
    each file is read at most once per run.
    """
    cached = HASHES.get(x)
    if cached is not None:
        return cached

    digest = hashlib.sha256()
    with open(x, "rb") as stream:
        # Read in 4 KiB pieces so large files are never held fully in memory.
        while True:
            piece = stream.read(4096)
            if piece == b"":
                break
            digest.update(piece)

    HASHES[x] = digest.hexdigest()
    return HASHES[x]
def Progress():
    """Advance the console spinner by one frame and bump PROGRESS_COUNT.

    On every 100th call a summary line is written as well, showing the call
    count, the number of entries in DUPES and the running DUPE_COUNT. Output
    ends with a carriage return so the next write overwrites it in place.
    """
    global PROGRESS_COUNT
    frame = PROGRESS_OPTIONS[PROGRESS_COUNT % len(PROGRESS_OPTIONS)]
    sys.stdout.write("%s\r" % frame)
    PROGRESS_COUNT += 1
    if PROGRESS_COUNT % 100 == 0:
        summary = (PROGRESS_COUNT, len(DUPES), DUPE_COUNT)
        sys.stdout.write("\r %s (DUPES: unique: %s total: %s)\r" % summary)
    # No newline is ever written, so flush explicitly to make the frame visible.
    sys.stdout.flush()
def SameFileByHash(a, b):
    """Compare two files by content hash.

    Args:
        a: Path of the first file.
        b: Path of the second file.

    Returns:
        The hash value shared by both files (as returned by GetHash) when
        they match, otherwise False. Callers use the returned value as the
        key that groups identical files, so it must be the digest itself and
        not the builtin ``hash`` function.
    """
    hash_value = GetHash(a)
    if hash_value == GetHash(b):
        return hash_value
    return False
def CheckFile(a):
    """Record the file at path ``a`` and report whether it duplicates an earlier one.

    Files are first bucketed by size in FILE_SIZE_HASH; only files of equal
    size are compared with SameFileByHash. On the first match the path is
    added to DUPES under the truthy value SameFileByHash returned, DUPE_COUNT
    is incremented and True is returned. Otherwise the path is appended to
    its size bucket and False is returned. Zero-length files are neither
    recorded nor compared.
    """
    global DUPE_COUNT
    byte_count = os.path.getsize(a)
    if not byte_count > 0:
        return False

    same_size_paths = FILE_SIZE_HASH.setdefault(byte_count, [])
    for earlier_path in same_size_paths:
        match_key = SameFileByHash(a, earlier_path)
        if not match_key:
            continue
        if match_key in DUPES:
            DUPES[match_key].append(a)
        else:
            # First duplicate for this key: start the group with both paths.
            DUPES[match_key] = [earlier_path, a]
        DUPE_COUNT += 1
        return True

    # Nothing matched; keep this path as a candidate for later files.
    same_size_paths.append(a)
    return False
def WriteResults():
    """Write the duplicate groups collected in DUPES to ``dupes.json``.

    The output is a JSON array whose elements are the lists stored as values
    in DUPES. The file is written to the current working directory and
    overwritten if it already exists.
    """
    # list(...) is needed because a dict values view is not JSON serializable
    # on Python 3; on Python 2 it is a harmless copy.
    groups = list(DUPES.values())
    # The context manager guarantees the data is flushed and the handle closed.
    with open("dupes.json", "w") as output_file:
        json.dump(
            groups,
            output_file,
            sort_keys=True,
            indent=4,
            separators=(',', ': '),
        )
# Register the writer before scanning so the results collected so far are
# written by WriteResults when the interpreter exits.
atexit.register(WriteResults)

# Each input line (from the files named on the command line, or stdin) is
# treated as one file path.
for line in fileinput.input():
    full_path = line.strip()
    if not full_path:
        # A blank line is not a path; os.path.getsize("") would raise and
        # abort the whole scan, so skip it without counting it as progress.
        continue
    Progress()
    CheckFile(full_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment