Skip to content

Instantly share code, notes, and snippets.

@danriti
Forked from miku/fdups.py
Last active April 1, 2016 11:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danriti/5193327 to your computer and use it in GitHub Desktop.
# http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them/748908#748908
import hashlib
import os
import sys

import mmh3
# Only the first CHUNK_SIZE bytes of each file are hashed; the file size is
# used as a tie-breaker, so a full-content hash is rarely needed in practice.
CHUNK_SIZE = 1024 * 1024


def check_for_duplicates(paths):
    """Walk every directory tree in *paths* and report probable duplicate files.

    Two files are considered duplicates when the hash of their first
    CHUNK_SIZE bytes AND their total size both match.  The first file seen
    with a given (hash, size) pair is treated as the canonical copy; each
    later match is printed as "Duplicate found: <new> and <canonical>".

    paths: iterable of directory paths to scan.
    Returns None; results are written to stdout.
    """
    seen = {}  # (digest, size) -> path of first file seen with that signature
    for root in paths:
        for dirpath, _, filenames in os.walk(root):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # 'with' guarantees the handle is closed; the original
                # leaked one open file descriptor per visited file.
                # hashlib replaces the third-party mmh3 dependency — the
                # digest is only used as an internal dict key, so any
                # stable hash gives identical duplicate-detection output.
                with open(full_path, 'rb') as fh:
                    digest = hashlib.md5(fh.read(CHUNK_SIZE)).hexdigest()
                file_id = (digest, os.path.getsize(full_path))
                original = seen.get(file_id)
                if original:
                    print("Duplicate found: %s and %s" % (full_path, original))
                else:
                    seen[file_id] = full_path
if __name__ == "__main__":
    # Guard so importing this module does not trigger a scan; only run
    # when executed as a script.  Every CLI argument is a root directory.
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment