-
-
Save nkmathew/2c21aa3dbb9d69d49549 to your computer and use it in GitHub Desktop.
Find duplicate files in python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them/748908#748908
Yet another duplicate file finder
CHANGELOG:
  + [Thursday] Feb 11, 2016
    - Skip folders with unicode characters in their filenames (keeps causing IOErrors)
    - Fix wrong time description due to argument misplacement
""" | |
import sys | |
import os | |
import hashlib | |
import collections | |
from pprint import pprint | |
import time | |
def chunk_reader(fobj, chunk_size=1024):
    """ Lazily yield successive pieces of ``fobj``, at most ``chunk_size`` each

    Reading stops at the first empty (falsy) result from ``fobj.read``.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)
def file_hashes(paths='.', hash_func=hashlib.sha1):
    """ Checksums for all files under one or more directory trees

    ``paths`` is either a single directory (a string or ``os.PathLike``
    object) or an iterable of directories; each one is walked recursively
    with ``os.walk``.  ``hash_func`` is a zero-argument callable returning a
    hashlib-style object that provides ``update`` and ``digest``.

    Returns a plain dict that maps ``(digest, size_in_bytes)`` to the list
    of file paths sharing that digest and size, for example:

        {(b'<digest>', 325): ['./pastebin_paster/.git/config'],
         (b'<digest>', 0): ['./p30.py', './tkinter-files/__init__.py']}

    Files that cannot be opened, read or sized are reported on stdout and
    left out of the result.
    """
    # A lone path must be wrapped: iterating a string would walk it
    # character by character.
    if isinstance(paths, str) or hasattr(paths, '__fspath__'):
        paths = [paths]
    hashes = collections.defaultdict(list)
    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash_func()
                try:
                    # The context manager closes the handle even when a read
                    # fails, so a large scan does not run out of descriptors.
                    with open(full_path, 'rb') as fobj:
                        for chunk in chunk_reader(fobj):
                            hashobj.update(chunk)
                    # Sized inside the try because the file may disappear
                    # between being listed and being inspected.
                    size = os.path.getsize(full_path)
                except (IOError, OSError):
                    print('IOError on ' + full_path)
                    continue
                hashes[(hashobj.digest(), size)].append(full_path)
    return dict(hashes)
def file_dupes(fhashes):
    """ Collects every path list in ``fhashes`` that holds more than one file

    ``fhashes`` is a mapping whose values are lists of paths; the keys are
    ignored.  The qualifying lists are returned in the mapping's own order.
    """
    return [group for group in fhashes.values() if len(group) > 1]
def find_dupes(path='.'):
    r""" Finds and displays files with the same sizes and checksums

    ``path`` is handed unchanged to ``file_hashes``, so it may be a single
    directory or a list of directories.  Every group of identical files is
    printed one path per line, followed by a blank line; a final summary
    line reports how many such groups were found.  Nothing is returned.

    Example output for ``find_dupes('.')`` on Windows:

        .\pastebin_paster\.git\logs\HEAD
        .\pastebin_paster\.git\logs\refs\heads\master

        .\pastebin_paster\.git\refs\heads\master
        .\pastebin_paster\.git\refs\remotes\origin\master

        .\p30.py
        .\tkinter-files\__init__.py

        Found 3 duplicate files
    """
    hashes = file_hashes(path)
    dupes = file_dupes(hashes)
    for group in dupes:
        # A distinct loop name keeps the ``path`` argument from being shadowed.
        for dupe_path in group:
            print(dupe_path)
        print('')
    # The count is the number of duplicate groups, not of individual files.
    print('Found %d duplicate files' % len(dupes))
def desc_elapsed_time(start_time):
    """
    Describes in English the whole seconds elapsed from start_time up to now

    start_time is a timestamp comparable with ``time.time()``.  Hours are
    shown only when non-zero; minutes are shown when hours are shown or
    when they are non-zero themselves.
    """
    total = int(time.time() - start_time)
    hours, remainder = divmod(total, 3600)
    mins, secs = divmod(remainder, 60)
    if hours:
        detail = '{0} Hours {1} Minute(s) {2} seconds'.format(hours, mins, secs)
    elif mins:
        detail = '{0} Minute(s) {1} seconds'.format(mins, secs)
    else:
        detail = '{0} seconds'.format(secs)
    return 'Elapsed time: ' + detail
def main():
    """
    Command-line entry point

    Searches the directories named on the command line, or the current
    directory when none are given, then prints how long the search took.
    """
    began = time.time()
    # An empty argument list is falsy, so '.' becomes the fallback target.
    targets = sys.argv[1:] or '.'
    find_dupes(targets)
    print(desc_elapsed_time(began))
# Run the search only when executed as a script, so that importing this
# module for its helpers has no side effects.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment