Skip to content

Instantly share code, notes, and snippets.

@briglx
Created October 26, 2018 22:24
Show Gist options
  • Save briglx/97f49b1722cbd663a846357e8d73d728 to your computer and use it in GitHub Desktop.
Remove duplicate files
import sys
import os
import hashlib
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive blocks of at most *chunk_size* bytes from *fobj*.

    Iteration stops as soon as ``fobj.read`` returns an empty (falsy)
    value, i.e. at end of file.
    """
    block = fobj.read(chunk_size)
    while block:
        yield block
        block = fobj.read(chunk_size)
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Recursively scan *paths* for files with identical content and
    delete the redundant copies.

    Two files count as duplicates when their (content digest, size)
    pairs match.  Of a duplicate pair, any path containing "Copy of"
    is scheduled for deletion; if neither path contains "Copy of",
    the pair is only reported, nothing is removed.

    :param paths: iterable of directory paths to walk with os.walk
    :param hash: hashlib digest constructor (name kept as ``hash`` for
        backward compatibility even though it shadows the builtin)
    """
    print(paths)
    hashes = {}    # (digest, size) -> first path seen with that content
    todelete = {}  # (digest, size) -> path selected for deletion
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Read in fixed-size chunks so large files are never fully
                # loaded into memory; 'with' guarantees the handle is
                # closed (the original leaked one descriptor per file).
                with open(full_path, 'rb') as fobj:
                    for chunk in iter(lambda: fobj.read(1024), b''):
                        hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                    # Prefer deleting whichever side is a "Copy of" file.
                    if "Copy of" in full_path:
                        todelete[file_id] = full_path
                    if "Copy of" in duplicate:
                        todelete[file_id] = duplicate
                else:
                    hashes[file_id] = full_path
    print("Delete..")
    for key, value in todelete.items():
        print("... %s" % value)
        os.remove(value)
# Script entry point: each CLI argument is a directory to scan for
# duplicates.  Guarded so importing this module has no side effects.
if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment