@arkarkark
Created November 4, 2017 10:13
Find duplicate files using file size, then a sha256 hash. Outputs JSON results for later processing.
#!/usr/bin/python
"""Find duplicate files using file size, then a sha256 hash.

Usage:
    find photos.photolibrary/Masters -type f -name '*.*' | ./finddupes

Tries to find duplicate images in a photos album and outputs a file
(dupes.json) for later processing. The output is a JSON-format file:

```
[
    ["/path/to/img.png", "path/to/img2.png"],
    ["/path/to/pic.png", "path/to/pic2.png", "path/to/pic4.png"]
]
```

Files in the same sub-array are expected to be identical.
"""
import atexit
import fileinput
import hashlib
import json
import os
import sys

PROGRESS_COUNT = 0
PROGRESS_OPTIONS = list('/-\\|')  # spinner frames
DUPE_COUNT = 0
HASHES = {}           # file path -> sha256 hex digest (memoized)
FILE_SIZE_HASH = {}   # file size in bytes -> list of paths seen at that size
DUPES = {}            # sha256 hex digest -> list of paths with identical contents
def GetHash(x):
    """Return the sha256 hex digest of a file, caching results by path."""
    if x not in HASHES:
        hash_object = hashlib.sha256()
        with open(x, "rb") as file_handle:
            # Read in 4KB chunks so large files never have to fit in memory.
            for chunk in iter(lambda: file_handle.read(4096), b""):
                hash_object.update(chunk)
        HASHES[x] = hash_object.hexdigest()
    return HASHES[x]
def Progress():
    """Write a spinner character, plus a count summary every 100 files."""
    global PROGRESS_COUNT
    sys.stdout.write("%s\r" % PROGRESS_OPTIONS[PROGRESS_COUNT % len(PROGRESS_OPTIONS)])
    PROGRESS_COUNT += 1
    if not PROGRESS_COUNT % 100:
        sys.stdout.write("\r %s (DUPES: unique: %s total: %s)\r" %
                         (PROGRESS_COUNT, len(DUPES), DUPE_COUNT))
    sys.stdout.flush()
def SameFileByHash(a, b):
    """Return the shared hash if the two files have identical contents, else False."""
    hash_value = GetHash(a)
    if hash_value == GetHash(b):
        return hash_value
    return False
def CheckFile(a):
    """Record a file, comparing it by hash against files of the same size."""
    global DUPE_COUNT
    size = os.path.getsize(a)
    found = False
    if size > 0:
        if size not in FILE_SIZE_HASH:
            FILE_SIZE_HASH[size] = []
        # Only hash when two files share the same size; the cheap size
        # comparison rules out most files without reading their contents.
        for file_name in FILE_SIZE_HASH[size]:
            hash_value = SameFileByHash(a, file_name)
            if hash_value:
                if hash_value not in DUPES:
                    DUPES[hash_value] = [file_name, a]
                else:
                    DUPES[hash_value].append(a)
                DUPE_COUNT += 1
                found = True
                break
        if not found:
            FILE_SIZE_HASH[size].append(a)
    return found
def WriteResults():
    """Write all duplicate groups to dupes.json (registered to run at exit)."""
    with open("dupes.json", "w") as out:
        json.dump(list(DUPES.values()), out,
                  sort_keys=True, indent=4, separators=(',', ': '))


atexit.register(WriteResults)

for line in fileinput.input():
    Progress()
    full_path = line.strip()
    CheckFile(full_path)
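
The script only writes dupes.json "for later processing". As a minimal sketch of what that later step might look like, here is an example consumer, assuming you keep the first path in each group and only report the rest; the keep-first policy is illustrative and not part of the gist:

```
#!/usr/bin/python
"""Example consumer of dupes.json (illustrative sketch, not part of the gist).
Keeps the first file in each duplicate group and lists the rest."""
import json
import os  # only needed if you enable deletion below

with open("dupes.json") as file_handle:
    groups = json.load(file_handle)  # list of lists; each sub-list holds identical files

for group in groups:
    keep, extras = group[0], group[1:]
    print("keep   %s" % keep)
    for path in extras:
        print("remove %s" % path)
        # os.remove(path)  # uncomment to actually delete the duplicates
```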
@bahamas10
ty so much for this omg

@arkarkark (Author)
> ty so much for this omg

I'm so glad you found it useful, @bahamas10. I have another script that actually makes an album of the offending images (slightly different criteria).
