Skip to content

Instantly share code, notes, and snippets.

@tpott
Created November 25, 2013 03:12
Show Gist options
  • Save tpott/7635716 to your computer and use it in GitHub Desktop.
Save tpott/7635716 to your computer and use it in GitHub Desktop.
Finds duplicate files in a specified directory by first comparing file sizes and then comparing MD5 hashes.
#! /usr/bin/env python
# find_dup_files.py
# Trevor Pottinger
# Sun Nov 10 21:55:45 PST 2013
import hashlib
import sys
import os
from os.path import join, getsize
def gen_find_dups(rootDir):
    """Yield duplicate files found under rootDir, in three steps.

    Step 1 collects every non-empty file with its size; step 2 computes
    the md5 only for files whose size is shared with at least one other
    file (hashing is the expensive part); step 3 yields every file whose
    md5 is shared with at least one other file.

    Yields:
        dicts of the form {'name': path, 'size': byte_count, 'md5': hexdigest}.
    """
    # STEP 1: walk the directory tree, recording each file's size.
    allFiles = []
    for root, _dirs, files in os.walk(rootDir):
        for f in files:
            fullFileName = join(root, f)
            fileSize = getsize(fullFileName)
            if fileSize == 0:
                continue  # skip empty files
            allFiles.append({'name': fullFileName, 'size': fileSize})

    # STEP 2: group by size; only files sharing a size can be duplicates.
    bySize = {}
    for info in allFiles:
        bySize.setdefault(info['size'], []).append(info)
    someFiles = []
    for sameSize in bySize.values():
        if len(sameSize) < 2:
            continue  # unique size => cannot have a duplicate
        for info in sameSize:
            # Hash in binary mode: text mode would corrupt the digest for
            # binary files (and hashlib rejects str under Python 3).
            # Read in chunks so memory use is bounded, and use a context
            # manager so the file descriptor is always closed.
            digest = hashlib.md5()
            with open(info['name'], 'rb') as fh:
                for chunk in iter(lambda: fh.read(1 << 20), b''):
                    digest.update(chunk)
            info['md5'] = digest.hexdigest()
            someFiles.append(info)

    # STEP 3: group by md5 and yield every file whose hash is shared.
    # Iterating hashes in sorted order matches the original sort-by-md5
    # output ordering.
    byHash = {}
    for info in someFiles:
        byHash.setdefault(info['md5'], []).append(info)
    for md5hex in sorted(byHash):
        sameHash = byHash[md5hex]
        if len(sameHash) >= 2:
            for info in sameHash:
                yield info
if __name__ == '__main__':
    # Usage: find_dup_files.py <directory>
    # Validate the argument before walking (resolves the old TODO): require
    # exactly one argument and require that it names an existing directory.
    if len(sys.argv) != 2 or not os.path.isdir(sys.argv[1]):
        sys.stderr.write('usage: %s <directory>\n' % sys.argv[0])
        sys.exit(1)
    for dupFile in gen_find_dups(sys.argv[1]):
        # One duplicate per line: "<md5> <path>"
        print(dupFile['md5'], dupFile['name'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment