@arkarkark
Created November 4, 2017 10:13
Find duplicate files using file size, then a sha256 hash. Outputs JSON results for later processing.
#!/usr/bin/python
"""Find duplicate files using file size, then a sha256 hash.

Usage:
    find photos.photolibrary/Masters -type f -name '*.*' | ./finddupes

Tries to find duplicate images in a photos album and outputs a file
(dupes.json) for later processing. The output is a JSON-format file:

```
[
    ["/path/to/img.png", "path/to/img2.png"],
    ["/path/to/pic.png", "path/to/pic2.png", "path/to/pic4.png"]
]
```

Files in the same sub-array are expected to be identical.
"""
import atexit
import fileinput
import hashlib
import json
import os
import sys

PROGRESS_COUNT = 0
PROGRESS_OPTIONS = list('/-\\|')  # spinner frames
DUPE_COUNT = 0
HASHES = {}           # file path -> sha256 hex digest (memoized)
FILE_SIZE_HASH = {}   # file size in bytes -> list of paths seen at that size
DUPES = {}            # sha256 hex digest -> list of paths with identical contents
def GetHash(x):
    """Return the sha256 hex digest of a file, caching results by path."""
    if x not in HASHES:
        hash_object = hashlib.sha256()
        with open(x, "rb") as file_handle:
            # Read in 4KB chunks so large files never have to fit in memory.
            for chunk in iter(lambda: file_handle.read(4096), b""):
                hash_object.update(chunk)
        HASHES[x] = hash_object.hexdigest()
    return HASHES[x]
def Progress():
    """Write a spinner character, plus a count summary every 100 files."""
    global PROGRESS_COUNT
    sys.stdout.write("%s\r" % PROGRESS_OPTIONS[PROGRESS_COUNT % len(PROGRESS_OPTIONS)])
    PROGRESS_COUNT += 1
    if not PROGRESS_COUNT % 100:
        sys.stdout.write("\r %s (DUPES: unique: %s total: %s)\r" %
                         (PROGRESS_COUNT, len(DUPES), DUPE_COUNT))
    sys.stdout.flush()
def SameFileByHash(a, b):
    """Return the shared hash if the two files have identical contents, else False."""
    hash_value = GetHash(a)
    if hash_value == GetHash(b):
        return hash_value
    return False
def CheckFile(a):
    """Record a file, comparing it by hash against files of the same size."""
    global DUPE_COUNT
    size = os.path.getsize(a)
    found = False
    if size > 0:
        if size not in FILE_SIZE_HASH:
            FILE_SIZE_HASH[size] = []
        # Only hash when two files share the same size; the cheap size
        # comparison rules out most files without reading their contents.
        for file_name in FILE_SIZE_HASH[size]:
            hash_value = SameFileByHash(a, file_name)
            if hash_value:
                if hash_value not in DUPES:
                    DUPES[hash_value] = [file_name, a]
                else:
                    DUPES[hash_value].append(a)
                DUPE_COUNT += 1
                found = True
                break
        if not found:
            FILE_SIZE_HASH[size].append(a)
    return found
def WriteResults():
    """Write all duplicate groups to dupes.json (registered to run at exit)."""
    with open("dupes.json", "w") as out:
        json.dump(list(DUPES.values()), out,
                  sort_keys=True, indent=4, separators=(',', ': '))


atexit.register(WriteResults)

for line in fileinput.input():
    Progress()
    full_path = line.strip()
    CheckFile(full_path)
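
The script only writes dupes.json "for later processing". As a minimal sketch of what that later step might look like, here is an example consumer, assuming you keep the first path in each group and only report the rest; the keep-first policy is illustrative and not part of the gist:

```
#!/usr/bin/python
"""Example consumer of dupes.json (illustrative sketch, not part of the gist).
Keeps the first file in each duplicate group and lists the rest."""
import json
import os  # only needed if you enable deletion below

with open("dupes.json") as file_handle:
    groups = json.load(file_handle)  # list of lists; each sub-list holds identical files

for group in groups:
    keep, extras = group[0], group[1:]
    print("keep   %s" % keep)
    for path in extras:
        print("remove %s" % path)
        # os.remove(path)  # uncomment to actually delete the duplicates
```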
@bahamas10
ty so much for this omg

@arkarkark (Author)
> ty so much for this omg

I'm so glad you found it useful, @bahamas10. I have another script that actually makes an album of the offending images (slightly different criteria).
