Skip to content

Instantly share code, notes, and snippets.

@theonlypwner
Last active March 20, 2016 01:08
Show Gist options
  • Save theonlypwner/1ca26a5de900dcb5d51b to your computer and use it in GitHub Desktop.
Duplicate File Detection
#!/usr/bin/env python
# Duplicate File Detection
__copyright__ = "Copyright (C) 2016 Victor Zheng"
__licence__ = "GNU GPL v3"
# Based on https://github.com/IanLee1521/utilities/blob/master/utilities/find_duplicates.py
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
import argparse
import hashlib
import os
import sys
def process(dirs):
    """Scan every directory in *dirs* and print groups of duplicate files.

    Strategy: first bucket all files by size (cheap stat call); only
    same-size groups are then hashed to confirm identical content.
    """
    files_by_size = {}
    print('Finding files...')
    for directory in dirs:
        processFolderBySize(directory, files_by_size)
    print('Detecting duplicates...')
    # Largest sizes first: big duplicates are the most interesting.
    # Iterating items() avoids the extra per-key dict lookup.
    for size, paths in sorted(files_by_size.items(), reverse=True):
        if len(paths) == 1:
            continue  # unique size -> cannot have a duplicate
        for digest, files in processFilesByHash(paths).items():
            if len(files) == 1:
                continue  # unique content within this size group
            print('{} ({} B) found {} times'.format(digest, size, len(files)))
            for path in files:
                print(' {}'.format(fixUnicode(path)))
def processFolderBySize(rootDir, dict_out):
    """Walk *rootDir* and group every file path by size into *dict_out*.

    *dict_out* maps size-in-bytes -> list of paths; entries accumulate
    across calls so several roots can share one dict. Empty leaf
    directories and files that vanish mid-scan are reported but skipped.
    """
    if not os.path.exists(rootDir):
        print('Invalid path: {}'.format(fixUnicode(rootDir)))
        return
    for directory, subdirs, files in os.walk(rootDir, topdown=False):
        if not files and not subdirs:
            print('Empty: {}'.format(fixUnicode(directory)))
            continue
        for name in files:
            path = os.path.join(directory, name)
            try:
                # Ask forgiveness rather than pre-checking with
                # os.path.exists(): the file can still disappear between
                # the check and the stat (TOCTOU race).
                size = os.path.getsize(path)
            except OSError:
                print('File disappeared: {}'.format(fixUnicode(path)))
                continue
            dict_out.setdefault(size, []).append(path)
def processFilesByHash(paths):
    """Hash each file in *paths*; return {hex digest: [paths with that hash]}.

    Paths sharing a digest have identical content (up to MD5 collisions),
    so any list longer than one element is a duplicate group.
    """
    groups = {}
    for path in paths:
        # setdefault replaces the manual if/else grow and avoids
        # shadowing the builtin `hash`.
        groups.setdefault(hashPath(path), []).append(path)
    return groups
def hashPath(path, blockSize=1048576):
    """Return the hex MD5 digest of the file at *path*.

    Reads in blockSize-byte chunks (default 1 MiB) so arbitrarily large
    files are hashed with bounded memory. MD5 is acceptable here: the
    goal is duplicate detection, not resistance to crafted collisions.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # Two-argument iter() calls f.read until it returns the sentinel
        # b'' at EOF — the idiomatic replacement for the do-while hack.
        for chunk in iter(lambda: f.read(blockSize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def fixUnicode(s):
    """Return *s* with each non-ASCII character replaced by '?'.

    Keeps printed paths safe on consoles whose encoding cannot
    represent the original characters.
    """
    return ''.join(c if ord(c) < 128 else '?' for c in s)
def main():
    """Parse the command line and run duplicate detection on each directory."""
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'dirs',
        metavar='dir',
        type=str,
        nargs='+',
        help='Director(y|ies) to check for duplicates',
    )
    arguments = parser.parse_args()
    process(arguments.dirs)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment