nkmathew/fdups.py

## fdups.py
#!/usr/bin/env python

"""
http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them/748908#748908

Yet another duplicate file finder

CHANGELOG:

+ [Thursday] Feb 11, 2016
  - Skip folders with unicode characters in their filenames (keeps causing IOErrors)
  - Fix wrong time description due to argument misplacement
"""

import sys
import os
import hashlib
import collections
from pprint import pprint
import time


def chunk_reader(fobj, chunk_size=1024):
    """ Yields successive pieces of fobj, each at most chunk_size long

    fobj is any object with a read(size) method. Iteration ends at the
    first empty read.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)


def file_hashes(paths='.', hash_func=hashlib.sha1):
    r""" Checksums for all files under the given directory tree(s)

    paths is a single directory path or an iterable of directory paths.
    Each tree is walked with os.walk and every file in it is read in binary
    mode and fed to a fresh hash_func() object.

    Returns a plain dict that maps (digest, size_in_bytes) to the list of
    full paths having that digest and size. Files that cannot be opened,
    read or sized are reported on stdout and left out of the result.

    Example:

    >>> file_hashes('.')
    {
        ('\xd9@!\xdb9\xe5\xa1\x1d\xbe\x88\x19\xcf!>\xda\xe29fq&', 325L):
            ['.\\pastebin_paster\\.git\\config'],
        ('\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t', 0L):
            ['.\\p30.py', '.\\tkinter-files\\__init__.py']
    }
    """
    if isinstance(paths, str):
        # A lone string would otherwise be iterated character by character
        paths = [paths]
    hashes = collections.defaultdict(list)
    for root in paths:
        for dirpath, _, filenames in os.walk(root):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash_func()
                try:
                    # The with-block closes the handle even when a read fails
                    with open(full_path, 'rb') as fobj:
                        for chunk in chunk_reader(fobj):
                            hashobj.update(chunk)
                    # Sized inside the try so a file that vanishes mid-scan
                    # is skipped like any other unreadable file
                    size = os.path.getsize(full_path)
                except (IOError, OSError):
                    print('IOError on ' + full_path)
                    continue
                hashes[(hashobj.digest(), size)].append(full_path)
    return dict(hashes)


def file_dupes(fhashes):
    """ Collects the groups of paths that share a key in fhashes

    fhashes maps a file identity to a list of paths. Every list holding
    more than one path is returned, in the mapping's iteration order.
    """
    return [paths for paths in fhashes.values() if len(paths) > 1]


def find_dupes(path='.'):
    r""" Finds and displays files with the same sizes and checksums

    path is a directory path or a list of directory paths; it is handed to
    file_hashes unchanged. The paths of each group of identical files are
    printed one per line with an empty line after the group, followed by a
    summary line whose count is the number of groups. Nothing is returned.

    >>> find_dupes('.')
    .\pastebin_paster\.git\logs\HEAD
    .\pastebin_paster\.git\logs\refs\heads\master
    <BLANKLINE>
    .\pastebin_paster\.git\refs\heads\master
    .\pastebin_paster\.git\refs\remotes\origin\master
    <BLANKLINE>
    .\p30.py
    .\tkinter-files\__init__.py
    <BLANKLINE>
    Found 3 duplicate files
    """
    hashes = file_hashes(path)
    dupes = file_dupes(hashes)
    for group in dupes:
        # Distinct loop name so the `path` argument is not overwritten
        for dup_path in group:
            print(dup_path)
        print('')
    print('Found %d duplicate files' % len(dupes))


def desc_elapsed_time(start_time):
    """
    Describes, in English, the time elapsed between start_time and now

    start_time is a timestamp comparable with time.time(). The elapsed time
    is truncated to whole seconds; hours are only mentioned when non-zero,
    and minutes only when hours or minutes are non-zero.
    """
    total = int(time.time() - start_time)
    hours, remainder = divmod(total, 3600)
    mins, secs = divmod(remainder, 60)
    if hours:
        detail = '{0} Hours {1} Minute(s) {2} seconds'.format(hours, mins, secs)
    elif mins:
        detail = '{0} Minute(s) {1} seconds'.format(mins, secs)
    else:
        detail = '{0} seconds'.format(secs)
    return 'Elapsed time: ' + detail


def main():
    """
    Script entry point

    Scans the directories named on the command line, or the current
    directory when none are given, then prints how long the scan took.
    """
    started_at = time.time()
    # An empty argument list is falsy, so '.' becomes the fallback target
    targets = sys.argv[1:] or '.'
    find_dupes(targets)
    print(desc_elapsed_time(started_at))

# Run the scan only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	"""
	http://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them/748908#748908

	Yet another duplicate file finder

	CHANGELOG:

	+ [Thursday] Feb 11, 2016
	- Skip folders with unicode characters in their filenames (keeps causing IOErrors)
	- Fix wrong time description due to argument misplacement
	"""

	import sys
	import os
	import hashlib
	import collections
	from pprint import pprint
	import time


	def chunk_reader(fobj, chunk_size=1024):
	    """ Yields successive pieces of fobj, each at most chunk_size long

	    fobj is any object with a read(size) method. Iteration ends at the
	    first empty read.
	    """
	    piece = fobj.read(chunk_size)
	    while piece:
	        yield piece
	        piece = fobj.read(chunk_size)


	def file_hashes(paths='.', hash_func=hashlib.sha1):
	    r""" Checksums for all files under the given directory tree(s)

	    paths is a single directory path or an iterable of directory paths.
	    Each tree is walked with os.walk and every file in it is read in binary
	    mode and fed to a fresh hash_func() object.

	    Returns a plain dict that maps (digest, size_in_bytes) to the list of
	    full paths having that digest and size. Files that cannot be opened,
	    read or sized are reported on stdout and left out of the result.

	    Example:

	    >>> file_hashes('.')
	    {
	        ('\xd9@!\xdb9\xe5\xa1\x1d\xbe\x88\x19\xcf!>\xda\xe29fq&', 325L):
	            ['.\\pastebin_paster\\.git\\config'],
	        ('\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t', 0L):
	            ['.\\p30.py', '.\\tkinter-files\\__init__.py']
	    }
	    """
	    if isinstance(paths, str):
	        # A lone string would otherwise be iterated character by character
	        paths = [paths]
	    hashes = collections.defaultdict(list)
	    for root in paths:
	        for dirpath, _, filenames in os.walk(root):
	            for filename in filenames:
	                full_path = os.path.join(dirpath, filename)
	                hashobj = hash_func()
	                try:
	                    # The with-block closes the handle even when a read fails
	                    with open(full_path, 'rb') as fobj:
	                        for chunk in chunk_reader(fobj):
	                            hashobj.update(chunk)
	                    # Sized inside the try so a file that vanishes mid-scan
	                    # is skipped like any other unreadable file
	                    size = os.path.getsize(full_path)
	                except (IOError, OSError):
	                    print('IOError on ' + full_path)
	                    continue
	                hashes[(hashobj.digest(), size)].append(full_path)
	    return dict(hashes)


	def file_dupes(fhashes):
	    """ Collects the groups of paths that share a key in fhashes

	    fhashes maps a file identity to a list of paths. Every list holding
	    more than one path is returned, in the mapping's iteration order.
	    """
	    return [paths for paths in fhashes.values() if len(paths) > 1]


	def find_dupes(path='.'):
	    r""" Finds and displays files with the same sizes and checksums

	    path is a directory path or a list of directory paths; it is handed to
	    file_hashes unchanged. The paths of each group of identical files are
	    printed one per line with an empty line after the group, followed by a
	    summary line whose count is the number of groups. Nothing is returned.

	    >>> find_dupes('.')
	    .\pastebin_paster\.git\logs\HEAD
	    .\pastebin_paster\.git\logs\refs\heads\master
	    <BLANKLINE>
	    .\pastebin_paster\.git\refs\heads\master
	    .\pastebin_paster\.git\refs\remotes\origin\master
	    <BLANKLINE>
	    .\p30.py
	    .\tkinter-files\__init__.py
	    <BLANKLINE>
	    Found 3 duplicate files
	    """
	    hashes = file_hashes(path)
	    dupes = file_dupes(hashes)
	    for group in dupes:
	        # Distinct loop name so the `path` argument is not overwritten
	        for dup_path in group:
	            print(dup_path)
	        print('')
	    print('Found %d duplicate files' % len(dupes))


	def desc_elapsed_time(start_time):
	    """
	    Describes, in English, the time elapsed between start_time and now

	    start_time is a timestamp comparable with time.time(). The elapsed time
	    is truncated to whole seconds; hours are only mentioned when non-zero,
	    and minutes only when hours or minutes are non-zero.
	    """
	    total = int(time.time() - start_time)
	    hours, remainder = divmod(total, 3600)
	    mins, secs = divmod(remainder, 60)
	    if hours:
	        detail = '{0} Hours {1} Minute(s) {2} seconds'.format(hours, mins, secs)
	    elif mins:
	        detail = '{0} Minute(s) {1} seconds'.format(mins, secs)
	    else:
	        detail = '{0} seconds'.format(secs)
	    return 'Elapsed time: ' + detail


	def main():
	    """
	    Script entry point

	    Scans the directories named on the command line, or the current
	    directory when none are given, then prints how long the scan took.
	    """
	    started_at = time.time()
	    # An empty argument list is falsy, so '.' becomes the fallback target
	    targets = sys.argv[1:] or '.'
	    find_dupes(targets)
	    print(desc_elapsed_time(started_at))

	# Run the scan only when this file is executed as a script, not on import
	if __name__ == '__main__':
	    main()