Skip to content

Instantly share code, notes, and snippets.

@Jwink3101
Last active November 20, 2020 02:19
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Jwink3101/4d0edae5eae762509676913aac050049 to your computer and use it in GitHub Desktop.
Save Jwink3101/4d0edae5eae762509676913aac050049 to your computer and use it in GitHub Desktop.
Really barebones duplicate file finder. Doesn't even have a CLI
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Duplicate file finder. Finds dupe files by comparing the following attributes.
Matching size is nessesary but very, very far from sufficient. Still, it is very
fast so we use that to cut out a lot of files.
* size
* Obviously not at all robust but a nessesary
* CRC checksum
* SHA1 hash
"""
from __future__ import division, print_function, unicode_literals
__version__ = '20200120'
import os
from collections import defaultdict
import hashlib
import itertools
import fnmatch
import zlib
######################### Settings 1
# Directory tree to scan for duplicates -- edit before running.
root = '/path/to/root'
# Glob patterns (fnmatch, matched case-insensitively) applied to each
# directory and file name individually.
excludes = ['.*','*.AAE',] # Note that they are on a per-file basis
# See Settings 2 below but those likely do not need to change
#########################
##### Hashers
def sha256(filepath,BLOCKSIZE=2**20):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is read in chunks of ``BLOCKSIZE`` bytes so large files do
    not need to fit in memory (2**20 bytes = 1 MiB, 2**12 = 4 KiB).

    Based on http://pythoncentral.io/hashing-files-with-python/
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as fobj:
        # iter() with a sentinel keeps calling read() until it returns b''
        for chunk in iter(lambda: fobj.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()
def crc32(filepath,blocksize=2**20):
    """Return the CRC-32 of the file as an 8-character lowercase hex string.

    ``blocksize`` bounds how much of the file sits in memory at once
    (2**20 = 1 MiB, 2**12 = 4 KiB), so large files stream through.
    """
    checksum = 0
    with open(filepath, 'rb') as fobj:
        # Fold each chunk into the running checksum
        for chunk in iter(lambda: fobj.read(blocksize), b''):
            checksum = zlib.crc32(chunk, checksum)
    # Per the zlib docs, mask with 0xffffffff so the value is the same
    # unsigned number on every Python version and platform.
    return format(checksum & 0xffffffff, '08x')  # zero-padded hex
def adler32_1kb(filepath):
    """Adler-32 of *only* the first 1 kb, as an 8-char lowercase hex string."""
    with open(filepath, 'rb') as fobj:
        head = fobj.read(1024)
    # Mask to an unsigned value (consistent across Python versions), then
    # format zero-padded to 8 hex digits.
    return format(zlib.adler32(head) & 0xffffffff, '08x')
######################### Settings 2
# Specify test functions.
# Each entry is (label, fn) where fn(path) returns a comparable value;
# files that differ under ANY test cannot be duplicates.
# The tests MUST be in order of severity!!!
# (cheapest/coarsest first: each round only re-tests files that still
# collide, so the expensive full-file hash runs on as few as possible)
tests = [
('size',lambda s:os.stat(s).st_size),  # free: a single stat() call
('Adler32 1kb',adler32_1kb),           # cheap: reads only the first 1 kb
# ('crc32',crc32),                     # optional middle tier (full read)
('sha256',sha256)                      # expensive: full-file hash
]
#########################
## Get items and size to start
# Walk the tree rooted at `root`, pruning excluded directories and
# skipping excluded file names and symlinks.  Result: `files` is a flat
# list of full paths that are candidates for duplicate detection.
files = []
excludes = [e.lower() for e in excludes]  # patterns matched case-insensitively
for dirpath, dirnames, filenames in os.walk(root): # TODO: Settable
    # Prune excluded directories *in place* so os.walk never descends
    # into them; iterate over a copy since we mutate the list.
    for dirname in dirnames[:]:
        if any(fnmatch.fnmatch(dirname.lower(), e) for e in excludes):
            dirnames.remove(dirname)
    for filename in filenames:
        # NOTE: removed a dead `filename.startswith('./')` strip -- os.walk
        # yields bare names, never './'-prefixed ones.
        if any(fnmatch.fnmatch(filename.lower(), e) for e in excludes):
            continue
        filename = os.path.join(dirpath, filename)
        if os.path.islink(filename):  # never hash through symlinks
            continue
        files.append(filename)
print(f'Initially found {len(files)} files')  # fixed typo 'Initally'
class ArgReturn(object):
    """Wrap a callable so every call also reports the arguments used.

    Calling the wrapper returns ``(args, kwargs, result)``, which lets a
    ``map()`` over many inputs keep track of which input produced which
    output.
    """

    def __init__(self, fun):
        self.fun = fun

    def __call__(self, *args, **kwargs):
        result = self.fun(*args, **kwargs)
        return args, kwargs, result
## Winnow the candidate list, one test at a time.
# After each round only files whose test value collides with at least one
# other file survive into the next (more expensive) round.
udict = None
for label, testfun in tests:
    groups = defaultdict(list)
    for filepath in files:
        groups[testfun(filepath)].append(filepath)
    # Keep only values shared by two or more files; rebuild `files` from
    # the surviving groups for the next round.
    udict = {val: paths for val, paths in groups.items() if len(paths) >= 2}
    files = [p for paths in udict.values() for p in paths]
    print(f"Test '{label}' found {len(udict)} unique values with {len(files)} files")
## Report: largest duplicate groups first.
keys = sorted(udict.keys(), key=lambda k: len(udict[k]), reverse=True)
for key in keys:
    files = udict[key]
    files.sort(key=str.lower)  # case-insensitive, stable ordering
    print(f'\nFollowing {len(files)} are identical:')
    for num, file in enumerate(files, start=1):
        print(f' {num}: {file}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment