Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Find duplicate files using file size then sha256 hash. Output json results for later processing.
#!/usr/bin/python
"""
find photos.photolibrary/Masters -type f -name '*.*' | ./finddupes
Tries to find duplicate images in a Photos library and writes the result to dupes.json for later processing.
The output file is JSON in the following format:
```
[
["/path/to/img.png", "path/to/img2.png"],
["/path/to/pic.png", "path/to/pic2.png", "path/to/pic4.png"]
]
```
Items in the same sub array are expected to be the same.
"""
import atexit
import fileinput
import hashlib
import os
import sys
import json
# Number of times Progress() has been called so far.
PROGRESS_COUNT = 0
# Spinner frames; Progress() picks one by PROGRESS_COUNT modulo the list length.
PROGRESS_OPTIONS = list('/-\\|/-\\|')
# Incremented by CheckFile() each time a file matches a previously seen file.
DUPE_COUNT = 0
# Cache filled by GetHash(): file path -> SHA-256 hex digest.
HASHES = {}
# File size in bytes -> list of paths of that size that matched no earlier file.
FILE_SIZE_HASH = {}
# Keyed by the truthy value SameFileByHash() returns for a match; each value is
# the list of matching paths. WriteResults() dumps the values to dupes.json.
DUPES = {}
def GetHash(x):
    """Return the SHA-256 hex digest of the file at path x.

    Digests are memoised in the module-level HASHES dict, so each path is
    read from disk at most once.
    """
    try:
        return HASHES[x]
    except KeyError:
        pass  # not cached yet: fall through and compute it

    digest = hashlib.sha256()
    with open(x, "rb") as stream:
        # Read in 4 KiB pieces so large files are never held in memory whole.
        while True:
            block = stream.read(4096)
            if not block:
                break
            digest.update(block)
    HASHES[x] = digest.hexdigest()
    return HASHES[x]
def Progress():
    """Draw one step of the progress spinner on stdout.

    Each call writes the next spinner frame and bumps PROGRESS_COUNT. Every
    100th call additionally writes a summary line with the number of calls,
    the number of entries in DUPES and the running DUPE_COUNT.
    """
    global PROGRESS_COUNT
    frame = PROGRESS_OPTIONS[PROGRESS_COUNT % len(PROGRESS_OPTIONS)]
    # The trailing "\r" returns the cursor so the next write overdraws this one.
    sys.stdout.write("%s\r" % frame)
    PROGRESS_COUNT += 1
    if PROGRESS_COUNT % 100 == 0:
        summary = (PROGRESS_COUNT, len(DUPES), DUPE_COUNT)
        sys.stdout.write("\r %s (DUPES: unique: %s total: %s)\r" % summary)
    sys.stdout.flush()
def SameFileByHash(a, b):
    """Compare two files by their SHA-256 digests.

    Args:
        a: path of the first file.
        b: path of the second file.

    Returns:
        The shared hex digest (a non-empty, truthy string) when both files
        hash to the same value, otherwise False.
    """
    hash_value = GetHash(a)
    if hash_value == GetHash(b):
        # Return the digest itself, not the builtin `hash` function, so that
        # callers can use it as a per-content key.
        return hash_value
    return False
def CheckFile(a):
    """Record the file at path a and report whether it duplicates an earlier one.

    Files are first bucketed by size in FILE_SIZE_HASH; only files of equal
    size are compared via SameFileByHash(). On a match the path is added to
    the matching group in DUPES and DUPE_COUNT is incremented. Empty files
    (size 0) are ignored.

    Returns:
        True if a matched a previously recorded file, otherwise False.
    """
    global DUPE_COUNT
    size = os.path.getsize(a)
    if not size > 0:
        return False

    same_size = FILE_SIZE_HASH.setdefault(size, [])
    for earlier in same_size:
        match_key = SameFileByHash(a, earlier)
        if not match_key:
            continue
        if match_key in DUPES:
            DUPES[match_key].append(a)
        else:
            # First match for this key: the group starts with both paths.
            DUPES[match_key] = [earlier, a]
        DUPE_COUNT += 1
        return True

    # No earlier file of this size matched, so remember this one as a
    # candidate for later files to be compared against.
    same_size.append(a)
    return False
def WriteResults():
    """Write every group of duplicate paths in DUPES to dupes.json.

    The file is created (or overwritten) in the current working directory
    and contains a JSON array of arrays of paths.
    """
    # Materialise the values: on Python 3 dict.values() is a view object that
    # json cannot serialise.
    groups = list(DUPES.values())
    # The with-block guarantees the file is flushed and closed.
    with open("dupes.json", "w") as out_file:
        json.dump(groups, out_file, sort_keys=True, indent=4, separators=(',', ': '))
# Register the writer before reading any input so it runs when the
# interpreter exits.
atexit.register(WriteResults)
# One path per line, read from the files named on the command line or stdin.
for line in fileinput.input():
    full_path = line.strip()
    if not full_path:
        # Blank lines are not paths; skip them instead of failing on "".
        continue
    Progress()
    try:
        CheckFile(full_path)
    except (OSError, IOError) as error:
        # A missing or unreadable file should not abort the whole scan.
        sys.stderr.write("skipping %s: %s\n" % (full_path, error))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.