@Jwink3101
Created August 12, 2019 15:54
Super, super, super barebones, barely-tested, feature-less duplicate file finder
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Find duplicate files.

Warning: This has not been thoroughly tested and is missing many
features including exclusions, error handling, parallelism, etc.
This is barebones.

Process:

* Walk the directory tree
* Compute the Adler-32 checksum of each file. This is faster than a
  cryptographic hash but far less reliable
* Determine candidate duplicates and confirm them via SHA-256
"""
from __future__ import print_function

import os
import zlib
import hashlib
from collections import defaultdict

## Specify this path. It is probably better to use an absolute path
DATAPATH = '/PATH/TO/DIRECTORY'
def adler(filepath, BLOCKSIZE=2**15):
    """
    Compute an incremental Adler-32 checksum of the file, reading it
    in BLOCKSIZE chunks. Faster than SHA-1 but far less reliable.

    From the zlib documentation:

    > Changed in version 3.0: Always returns an unsigned value.
    > To generate the same numeric value across all Python versions and
    > platforms, use adler32(data) & 0xffffffff.
    """
    csum = 1  # Adler-32 is seeded with 1
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            csum = zlib.adler32(buf, csum)
            buf = afile.read(BLOCKSIZE)
    csum = csum & 0xffffffff  # Unsigned on all Python versions/platforms
    return csum
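
# Illustrative aside, safe to delete: zlib.adler32 takes the running
# checksum as its second argument, so hashing a file in chunks, as adler()
# does above, matches hashing all of its bytes at once:
#
#   >>> import zlib
#   >>> part = zlib.adler32(b'world', zlib.adler32(b'hello '))
#   >>> part & 0xffffffff == zlib.adler32(b'hello world') & 0xffffffff
#   True
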
def sha256(filepath, BLOCKSIZE=2**15):
    """
    Compute the SHA-256 hash of the file, reading it in BLOCKSIZE chunks.
    For reference: 2**12 is 4 KiB, the default 2**15 is 32 KiB, and
    2**20 is 1 MiB.
    """
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
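
# Illustrative aside, safe to delete: hashlib objects are likewise
# incremental, so repeated update() calls give the same digest as hashing
# the concatenated bytes in one shot:
#
#   >>> import hashlib
#   >>> h = hashlib.sha256(); h.update(b'foo'); h.update(b'bar')
#   >>> h.hexdigest() == hashlib.sha256(b'foobar').hexdigest()
#   True
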
if __name__ == '__main__':
    dups_adler = defaultdict(list)
    dups_sha256 = defaultdict(list)

    # Find all files and compute the Adler-32 checksum (faster than SHA-256)
    for dirpath, dirnames, filenames in os.walk(DATAPATH):
        for filename in filenames:
            filename = os.path.join(dirpath, filename)
            hh = adler(filename)
            dups_adler[hh].append(filename)

    # Now compute the SHA-256 for any candidate duplicates
    dups0 = [k for k, v in dups_adler.items() if len(v) > 1]
    for dup in dups0:
        for dd in dups_adler[dup]:
            hh = sha256(dd)
            dups_sha256[hh].append(dd)

    for files in dups_sha256.values():
        if len(files) == 1:
            continue  # Adler-32 collision only; not really a duplicate
        print('\nThe following {} files are duplicates'.format(len(files)))
        for file in files:
            print(' ' + file)
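
To try it: set DATAPATH to the directory you want to scan, then run the
script (a minimal sketch; "find_dups.py" is just an assumed filename for
this gist):

    python find_dups.py

Only groups whose SHA-256 hashes also match are printed; files that merely
collide on Adler-32 are filtered out by the final len(files) == 1 check.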