Skip to content

Instantly share code, notes, and snippets.

@nkmathew
Forked from miku/fdups.py
Last active April 28, 2016 12:09
Show Gist options
  • Save nkmathew/2c21aa3dbb9d69d49549 to your computer and use it in GitHub Desktop.
Save nkmathew/2c21aa3dbb9d69d49549 to your computer and use it in GitHub Desktop.
Find duplicate files in python.
#!/usr/bin/env python
"""
http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them/748908#748908
Yet another duplicate file finder
CHANGELOG:
+ [Thursday] Feb 11, 2016
- Skip folders with Unicode characters in their filenames (they keep causing IOErrors)
- Fix wrong time description due to argument misplacement
"""
import sys
import os
import hashlib
import collections
from pprint import pprint
import time
def chunk_reader(fobj, chunk_size=1024):
    """ Lazily yield successive pieces of at most chunk_size from fobj

    Reading stops as soon as fobj.read() returns an empty (falsy) result,
    so nothing is yielded for an empty file.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)
def file_hashes(paths='.', hash_func=hashlib.sha1):
    """ Checksums for all files under one or more directory trees

    Args:
        paths: A single directory (text string or os.PathLike object) or an
            iterable of directories. Each one is walked recursively with
            os.walk().
        hash_func: Zero-argument callable returning a hashlib-style object
            providing update() and digest(). Defaults to hashlib.sha1.

    Returns:
        A plain dict mapping (digest, size_in_bytes) tuples to the list of
        file paths sharing that digest and size, for example:

            {(<sha1 digest>, 0): ['./p30.py', './tkinter-files/__init__.py'],
             (<sha1 digest>, 325): ['./pastebin_paster/.git/config']}

    Files that cannot be opened, read or stat'ed are reported on stdout
    ('IOError on <path>') and left out of the result.
    """
    # A lone path must be wrapped in a list; otherwise the loop below would
    # iterate over its characters (text) or fail outright (path objects).
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    if isinstance(paths, (str, type(u''))) or hasattr(paths, '__fspath__'):
        paths = [paths]
    hashes = collections.defaultdict(list)
    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash_func()
                try:
                    # The context manager closes the handle even when a read
                    # fails, so a large tree cannot exhaust file descriptors.
                    with open(full_path, 'rb') as fobj:
                        for chunk in chunk_reader(fobj):
                            hashobj.update(chunk)
                    # Kept inside the try: the file may disappear between
                    # hashing and stat'ing it.
                    size = os.path.getsize(full_path)
                except (IOError, OSError):
                    print('IOError on ' + full_path)
                    continue
                hashes[(hashobj.digest(), size)].append(full_path)
    return dict(hashes)
def file_dupes(fhashes):
    """ Collects every path list in fhashes that holds more than one file

    fhashes maps a file identity to the list of paths having that identity;
    the lists with at least two entries are returned, in mapping order.
    """
    return [group for group in fhashes.values() if len(group) > 1]
def find_dupes(path='.'):
    r""" Finds and displays files with the same sizes and checksums

    Args:
        path: Directory, or list of directories, handed to file_hashes().

    Prints every group of identical files, one path per line with an empty
    line after each group, followed by a summary line. The number in the
    summary is the count of groups, not of individual files. Returns None.

    Example output for find_dupes('.'):

        .\pastebin_paster\.git\logs\HEAD
        .\pastebin_paster\.git\logs\refs\heads\master

        .\pastebin_paster\.git\refs\heads\master
        .\pastebin_paster\.git\refs\remotes\origin\master

        .\p30.py
        .\tkinter-files\__init__.py

        Found 3 duplicate files
    """
    dupes = file_dupes(file_hashes(path))
    for group in dupes:
        # Distinct loop name so the `path` argument is not shadowed.
        for dup_path in group:
            print(dup_path)
        print('')
    print('Found %d duplicate files' % len(dupes))
def desc_elapsed_time(start_time):
    """
    Describes, in English, the time elapsed between start_time (a
    time.time() timestamp) and now, truncated to whole seconds.
    Hours and minutes are only mentioned when they are non-zero.
    """
    total_seconds = int(time.time() - start_time)
    hours, remainder = divmod(total_seconds, 3600)
    mins, secs = divmod(remainder, 60)
    if hours:
        detail = '{0} Hours {1} Minute(s) {2} seconds'.format(hours, mins, secs)
    elif mins:
        detail = '{0} Minute(s) {1} seconds'.format(mins, secs)
    else:
        detail = '{0} seconds'.format(secs)
    return 'Elapsed time: ' + detail
def main():
    """
    Script entry point: look for duplicates in the directories named on
    the command line (the current directory when none are given), then
    print how long the scan took
    """
    started_at = time.time()
    # An empty argument list is falsy, so this falls back to '.'.
    targets = sys.argv[1:] or '.'
    find_dupes(targets)
    print(desc_elapsed_time(started_at))
# Run the duplicate finder only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment