Skip to content

Instantly share code, notes, and snippets.

@APadierna
Created December 24, 2015 11:34
Show Gist options
  • Save APadierna/a9f77130e0d913f30d76 to your computer and use it in GitHub Desktop.
Save APadierna/a9f77130e0d913f30d76 to your computer and use it in GitHub Desktop.
Script to crawl into a directory and detect (by hash) duplicated files and (optionally) remove them
#!/usr/bin/env python
"""
Script to crawl into a directory and detect (by hash) duplicated files and (optionally)
remove them
Kudos to http://stackoverflow.com/a/748908
"""
import argparse
import hashlib
import os
import sys
def main():
    """Parse command-line arguments and run the duplicate scan.

    Flags:
        --folder  Base search folder (default: current directory).
        --purge   When set, remove each duplicate as it is found.
    """
    parser = argparse.ArgumentParser(description='Recursively seek for duplicated files.')
    parser.add_argument('--folder',
                        default='.',
                        help='Base search folder')
    parser.add_argument('--purge',
                        # store_true is the idiomatic equivalent of
                        # store_const/const=True/default=False.
                        action='store_true',
                        help='Remove duplicated files')
    args = parser.parse_args()
    # Wrap the folder in a list: check_for_duplicates iterates over its
    # `paths` argument, and iterating a bare string would treat every
    # character of the folder name as a separate path.
    check_for_duplicates([args.folder], purge=args.purge)
def check_for_duplicates(paths, hash=hashlib.sha1, purge=False):
    """Walk *paths* recursively, report duplicate files, optionally delete them.

    Files are considered duplicates when both their content hash and their
    size match. For each duplicate pair the previously seen copy is the one
    reported (and removed when *purge* is true).

    Args:
        paths: Iterable of directory paths to walk. A single path string is
            also accepted and treated as one directory.
        hash: Hash constructor used to fingerprint file contents
            (default: hashlib.sha1).
        purge: When True, remove the earlier copy of each duplicate found.
    """
    if isinstance(paths, str):
        # Guard against a bare string, which would otherwise be iterated
        # character by character.
        paths = [paths]
    hashes = {}
    for path in paths:
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Read in fixed-size chunks inside a context manager so the
                # file handle is always closed (the original leaked it).
                with open(full_path, 'rb') as fobj:
                    for chunk in iter(lambda: fobj.read(65536), b''):
                        hashobj.update(chunk)
                # Key on (digest, size) so a hash collision alone is not
                # enough to flag a duplicate.
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id)
                if duplicate:
                    print("Duplicate found: \n\t%s and \n\t%s" % (full_path, duplicate))
                    if purge:
                        print("Removing duplicated file: \n\t%s" % (duplicate))
                        os.remove(duplicate)
                        # Record the surviving copy; otherwise a third
                        # identical file would try to remove the
                        # already-deleted path again.
                        hashes[file_id] = full_path
                else:
                    hashes[file_id] = full_path
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive blocks of at most *chunk_size* bytes read from *fobj*.

    Stops as soon as a read returns an empty (falsy) result, i.e. at EOF.
    """
    chunk = fobj.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fobj.read(chunk_size)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment