Created
November 25, 2013 03:12
-
-
Save tpott/7635716 to your computer and use it in GitHub Desktop.
Finds duplicate files in a specified directory. First checks file size, and then the md5.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# find_dup_files.py
# Trevor Pottinger
# Sun Nov 10 21:55:45 PST 2013
import hashlib | |
import sys | |
import os | |
from os.path import join, getsize | |
def gen_find_dups(rootDir):
    """Yield a record for every duplicated, non-empty file under rootDir.

    The search runs in three steps so that file contents are only read when
    strictly necessary:

    1. Walk the tree below ``rootDir`` and record each non-empty file's size.
    2. Compute the md5 of every file whose size is shared with another file.
    3. Yield the files whose md5 is shared with at least one other file.

    Args:
        rootDir: path of the directory to scan; passed to ``os.walk``.

    Yields:
        dicts with the keys ``'name'`` (path built by joining the walked
        directory and the file name), ``'size'`` (bytes) and ``'md5'`` (hex
        digest), ordered by md5.

    Raises:
        OSError / IOError: propagated from ``getsize`` or ``open`` when a
            file cannot be inspected or read.
    """
    # STEP 1: walk the directory tree, read file size
    allFiles = []
    for root, _dirs, files in os.walk(rootDir):
        for f in files:
            fullFileName = join(root, f)
            fileSize = getsize(fullFileName)
            if fileSize == 0:
                continue  # skip empty files
            allFiles.append({'name': fullFileName, 'size': fileSize})

    # STEP 2: sort by file size, ignore unique sizes, compute md5
    allFiles.sort(key=lambda entry: entry['size'])
    sizeCounts = {}
    for entry in allFiles:
        sizeCounts[entry['size']] = sizeCounts.get(entry['size'], 0) + 1

    someFiles = []
    for entry in allFiles:
        if sizeCounts[entry['size']] < 2:
            continue  # a unique size can never be a duplicate
        digest = hashlib.md5()
        # Hash raw bytes: text mode would translate newlines on some
        # platforms and hands md5 a str instead of bytes on Python 3.
        # Reading in chunks keeps memory use flat for large files, and the
        # with-block guarantees the handle is closed.
        with open(entry['name'], 'rb') as fh:
            for chunk in iter(lambda: fh.read(1 << 20), b''):
                digest.update(chunk)
        entry['md5'] = digest.hexdigest()
        someFiles.append(entry)

    # STEP 3: sort by md5, ignore unique hashes, return remaining
    someFiles.sort(key=lambda entry: entry['md5'])
    md5Counts = {}
    for entry in someFiles:
        md5Counts[entry['md5']] = md5Counts.get(entry['md5'], 0) + 1

    for entry in someFiles:
        if md5Counts[entry['md5']] > 1:
            yield entry
if __name__ == '__main__':
    # Script entry point: print "<md5> <path>" for every duplicate file found
    # under the directory given as the first command-line argument.
    if len(sys.argv) < 2 or not os.path.isdir(sys.argv[1]):
        # Fail with a usage message instead of an IndexError or an empty,
        # misleading result when the argument is missing or not a directory.
        sys.stderr.write('usage: %s DIRECTORY\n' % sys.argv[0])
        sys.exit(1)
    for dupFile in gen_find_dups(sys.argv[1]):
        # A single pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (dupFile['md5'], dupFile['name']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment