Skip to content

Instantly share code, notes, and snippets.

@Jwink3101
Last active November 20, 2020 02:19
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Jwink3101/4d0edae5eae762509676913aac050049 to your computer and use it in GitHub Desktop.
Save Jwink3101/4d0edae5eae762509676913aac050049 to your computer and use it in GitHub Desktop.
Really barebones duplicate file finder. Doesn't even have a CLI
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Duplicate file finder. Finds dupe files by comparing the following attributes.
Matching size is nessesary but very, very far from sufficient. Still, it is very
fast so we use that to cut out a lot of files.
* size
* Obviously not at all robust but a nessesary
* CRC checksum
* SHA1 hash
"""
from __future__ import division, print_function, unicode_literals
__version__ = '20200120'
import os
from collections import defaultdict
import hashlib
import itertools
import fnmatch
import zlib
######################### Settings 1
# Directory tree to scan for duplicates -- edit before running.
root = '/path/to/root'
# Glob patterns (fnmatch, matched case-insensitively) applied to each
# directory and file name individually.
excludes = ['.*','*.AAE',] # Note that they are on a per-file basis
# See Settings 2 below but those likely do not need to change
#########################
##### Hashers
def sha256(filepath,BLOCKSIZE=2**20):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is read in chunks of ``BLOCKSIZE`` bytes so large files do
    not need to fit in memory (2**20 bytes = 1 MiB, 2**12 = 4 KiB).

    Based on http://pythoncentral.io/hashing-files-with-python/
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as fobj:
        # iter() with a sentinel keeps calling read() until it returns b''
        for chunk in iter(lambda: fobj.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()
def crc32(filepath,blocksize=2**20):
    """Return the CRC-32 of the file as an 8-character lowercase hex string.

    ``blocksize`` bounds how much of the file sits in memory at once
    (2**20 = 1 MiB, 2**12 = 4 KiB), so large files stream through.
    """
    checksum = 0
    with open(filepath, 'rb') as fobj:
        # Fold each chunk into the running checksum
        for chunk in iter(lambda: fobj.read(blocksize), b''):
            checksum = zlib.crc32(chunk, checksum)
    # Per the zlib docs, mask with 0xffffffff so the value is the same
    # unsigned number on every Python version and platform.
    return format(checksum & 0xffffffff, '08x')  # zero-padded hex
def adler32_1kb(filepath):
    """Adler-32 of *only* the first 1 kb, as an 8-char lowercase hex string."""
    with open(filepath, 'rb') as fobj:
        head = fobj.read(1024)
    # Mask to an unsigned value (consistent across Python versions), then
    # format zero-padded to 8 hex digits.
    return format(zlib.adler32(head) & 0xffffffff, '08x')
######################### Settings 2
# Specify test functions.
# Each entry is (label, fn) where fn(path) returns a comparable value;
# files that differ under ANY test cannot be duplicates.
# The tests MUST be in order of severity!!!
# (cheapest/coarsest first: each round only re-tests files that still
# collide, so the expensive full-file hash runs on as few as possible)
tests = [
('size',lambda s:os.stat(s).st_size),  # free: a single stat() call
('Adler32 1kb',adler32_1kb),           # cheap: reads only the first 1 kb
# ('crc32',crc32),                     # optional middle tier (full read)
('sha256',sha256)                      # expensive: full-file hash
]
#########################
## Get items and size to start
# Walk the tree rooted at `root`, pruning excluded directories and
# skipping excluded file names and symlinks.  Result: `files` is a flat
# list of full paths that are candidates for duplicate detection.
files = []
excludes = [e.lower() for e in excludes]  # patterns matched case-insensitively
for dirpath, dirnames, filenames in os.walk(root): # TODO: Settable
    # Prune excluded directories *in place* so os.walk never descends
    # into them; iterate over a copy since we mutate the list.
    for dirname in dirnames[:]:
        if any(fnmatch.fnmatch(dirname.lower(), e) for e in excludes):
            dirnames.remove(dirname)
    for filename in filenames:
        # NOTE: removed a dead `filename.startswith('./')` strip -- os.walk
        # yields bare names, never './'-prefixed ones.
        if any(fnmatch.fnmatch(filename.lower(), e) for e in excludes):
            continue
        filename = os.path.join(dirpath, filename)
        if os.path.islink(filename):  # never hash through symlinks
            continue
        files.append(filename)
print(f'Initially found {len(files)} files')  # fixed typo 'Initally'
class ArgReturn(object):
    """Wrap a callable so every call also reports the arguments used.

    Calling the wrapper returns ``(args, kwargs, result)``, which lets a
    ``map()`` over many inputs keep track of which input produced which
    output.
    """

    def __init__(self, fun):
        self.fun = fun

    def __call__(self, *args, **kwargs):
        result = self.fun(*args, **kwargs)
        return args, kwargs, result
## Winnow the candidate list, one test at a time.
# After each round only files whose test value collides with at least one
# other file survive into the next (more expensive) round.
udict = None
for label, testfun in tests:
    groups = defaultdict(list)
    for filepath in files:
        groups[testfun(filepath)].append(filepath)
    # Keep only values shared by two or more files; rebuild `files` from
    # the surviving groups for the next round.
    udict = {val: paths for val, paths in groups.items() if len(paths) >= 2}
    files = [p for paths in udict.values() for p in paths]
    print(f"Test '{label}' found {len(udict)} unique values with {len(files)} files")
## Report: largest duplicate groups first.
keys = sorted(udict.keys(), key=lambda k: len(udict[k]), reverse=True)
for key in keys:
    files = udict[key]
    files.sort(key=str.lower)  # case-insensitive, stable ordering
    print(f'\nFollowing {len(files)} are identical:')
    for num, file in enumerate(files, start=1):
        print(f' {num}: {file}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment