Created
November 4, 2017 10:13
-
-
Save arkarkark/42e32ea7c38589092e572e1394f5dd9e to your computer and use it in GitHub Desktop.
Find duplicate files using file size then sha256 hash. Output json results for later processing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Find duplicate files using file size then sha256 hash; output JSON for later processing.

find photos.photolibrary/Masters -type f -name '*.*' | ./finddupes
Will try and find duplicate images in a photos album and output a file for later processing
file output will be a json format file
```
[
  ["/path/to/img.png", "path/to/img2.png"],
  ["/path/to/pic.png", "path/to/pic2.png", "path/to/pic4.png"]
]
```
Items in the same sub array are expected to be the same.
"""
import atexit | |
import fileinput | |
import hashlib | |
import os | |
import sys | |
import json | |
PROGRESS_COUNT = 0  # number of input lines processed so far; drives the spinner
PROGRESS_OPTIONS = list('/-\\|/-\\|')  # spinner glyphs cycled by Progress()
DUPE_COUNT = 0  # running total of duplicates found (copies beyond the first)
HASHES = {}  # cache: file path -> sha256 hexdigest, so each file is hashed once
FILE_SIZE_HASH = {}  # file size in bytes -> list of paths seen with that size
DUPES = {}  # hash key -> list of paths whose contents are identical
def GetHash(x):
    """Return the sha256 hexdigest of the file at path *x*.

    Results are memoized in the module-level HASHES dict so each file is
    read and hashed at most once per run.

    Args:
      x: path to a readable file on disk.

    Returns:
      The hex-encoded sha256 digest of the file's contents.
    """
    if x not in HASHES:
        hash_object = hashlib.sha256()
        # Read in fixed-size chunks so large files never have to fit in memory.
        with open(x, "rb") as file_handle:
            for chunk in iter(lambda: file_handle.read(4096), b""):
                hash_object.update(chunk)
        HASHES[x] = hash_object.hexdigest()
    return HASHES[x]
def Progress():
    """Spin a one-character progress indicator on stdout.

    Every 100 files, also overwrite the line with a running summary of how
    many duplicate groups (unique) and duplicate copies (total) were found.
    """
    global PROGRESS_COUNT
    glyph = PROGRESS_OPTIONS[PROGRESS_COUNT % len(PROGRESS_OPTIONS)]
    sys.stdout.write("%s\r" % glyph)
    PROGRESS_COUNT += 1
    if PROGRESS_COUNT % 100 == 0:
        summary = "\r %s (DUPES: unique: %s total: %s)\r" % (PROGRESS_COUNT, len(DUPES), DUPE_COUNT)
        sys.stdout.write(summary)
    sys.stdout.flush()
def SameFileByHash(a, b):
    """Compare the contents of files *a* and *b* via their sha256 digests.

    Args:
      a: path of the first file.
      b: path of the second file.

    Returns:
      The shared sha256 hexdigest (a truthy str) when the contents match,
      False otherwise.
    """
    hash_value = GetHash(a)
    if hash_value == GetHash(b):
        # BUG FIX: the original `return hash` returned the *builtin* hash
        # function, so every duplicate pair produced the same DUPES key and
        # all groups were merged into a single entry. Return the digest.
        return hash_value
    return False
def CheckFile(a):
    """Register file *a*, grouping it with any earlier file of identical contents.

    Files are first bucketed by size (cheap); only same-size candidates are
    compared by hash. Zero-length files are ignored. Matches are recorded in
    DUPES and counted in DUPE_COUNT.

    Returns:
      True when *a* duplicates a previously seen file, False otherwise.
    """
    global DUPE_COUNT
    size = os.path.getsize(a)
    found = False
    if size > 0:
        candidates = FILE_SIZE_HASH.setdefault(size, [])
        for file_name in candidates:
            hash_value = SameFileByHash(a, file_name)
            if not hash_value:
                continue
            if hash_value in DUPES:
                DUPES[hash_value].append(a)
            else:
                DUPES[hash_value] = [file_name, a]
            DUPE_COUNT += 1
            found = True
            break
        if not found:
            candidates.append(a)
    return found
def WriteResults():
    """Write the duplicate groups to dupes.json as a sorted, indented JSON array of arrays."""
    # `with` closes the handle the original leaked, and list() makes this
    # work on Python 3 too, where dict.values() is a view json can't dump.
    with open("dupes.json", "w") as output:
        json.dump(list(DUPES.values()), output, sort_keys=True, indent=4, separators=(',', ': '))
# Write whatever was found even if the run is interrupted partway through.
atexit.register(WriteResults)
# Read one file path per line from stdin (or from files named as arguments).
for line in fileinput.input():
    Progress()
    full_path = line.strip()
    CheckFile(full_path)
ty so much for this omg
I'm so glad you found it useful. @bahamas10 . I have another script that actually makes an album of the offending images (slightly different criteria).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
ty so much for this omg