@Jwink3101
Created August 12, 2019 15:54
Super, super, super barebones, barely-tested, feature-less duplicate file finder
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Find duplicate files.

Warning: This has not been thoroughly tested and is missing many
features including exclusions, error handling, parallelism, etc.
This is barebones.

Process:

* Walk the directory tree
* Compute the Adler-32 checksum of each file. This is faster than a
  cryptographic hash but far less reliable
* Determine candidate duplicates and confirm them via SHA-256
"""
from __future__ import print_function

import os
import zlib
import hashlib
from collections import defaultdict

## Specify this path. It is probably better to use an absolute path
DATAPATH = '/PATH/TO/DIRECTORY'
def adler(filepath, BLOCKSIZE=2**15):
    """
    Compute an incremental Adler-32 checksum of the file, reading it
    in BLOCKSIZE chunks. Faster than SHA-1 but far less reliable.

    From the zlib documentation:

    > Changed in version 3.0: Always returns an unsigned value.
    > To generate the same numeric value across all Python versions and
    > platforms, use adler32(data) & 0xffffffff.
    """
    csum = 1  # Adler-32 is seeded with 1
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            csum = zlib.adler32(buf, csum)
            buf = afile.read(BLOCKSIZE)
    csum = csum & 0xffffffff  # Unsigned on all Python versions/platforms
    return csum
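
# Illustrative aside, safe to delete: zlib.adler32 takes the running
# checksum as its second argument, so hashing a file in chunks, as adler()
# does above, matches hashing all of its bytes at once:
#
#   >>> import zlib
#   >>> part = zlib.adler32(b'world', zlib.adler32(b'hello '))
#   >>> part & 0xffffffff == zlib.adler32(b'hello world') & 0xffffffff
#   True
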
def sha256(filepath, BLOCKSIZE=2**15):
    """
    Compute the SHA-256 hash of the file, reading it in BLOCKSIZE chunks.
    For reference: 2**12 is 4 KiB, the default 2**15 is 32 KiB, and
    2**20 is 1 MiB.
    """
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
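
# Illustrative aside, safe to delete: hashlib objects are likewise
# incremental, so repeated update() calls give the same digest as hashing
# the concatenated bytes in one shot:
#
#   >>> import hashlib
#   >>> h = hashlib.sha256(); h.update(b'foo'); h.update(b'bar')
#   >>> h.hexdigest() == hashlib.sha256(b'foobar').hexdigest()
#   True
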
if __name__ == '__main__':
    dups_adler = defaultdict(list)
    dups_sha256 = defaultdict(list)

    # Find all files and compute the Adler-32 checksum (faster than SHA-256)
    for dirpath, dirnames, filenames in os.walk(DATAPATH):
        for filename in filenames:
            filename = os.path.join(dirpath, filename)
            hh = adler(filename)
            dups_adler[hh].append(filename)

    # Now compute the SHA-256 for any candidate duplicates
    dups0 = [k for k, v in dups_adler.items() if len(v) > 1]
    for dup in dups0:
        for dd in dups_adler[dup]:
            hh = sha256(dd)
            dups_sha256[hh].append(dd)

    for files in dups_sha256.values():
        if len(files) == 1:
            continue  # Adler-32 collision only; not really a duplicate
        print('\nThe following {} files are duplicates'.format(len(files)))
        for file in files:
            print(' ' + file)
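
To try it: set DATAPATH to the directory you want to scan, then run the
script (a minimal sketch; "find_dups.py" is just an assumed filename for
this gist):

    python find_dups.py

Only groups whose SHA-256 hashes also match are printed; files that merely
collide on Adler-32 are filtered out by the final len(files) == 1 check.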