@kuanyui
Last active August 29, 2015 14:24
Find duplicated files via MD5 checksum and output the list as a file.
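For reference, a typical invocation might look like the following (the script filename and the target directory are placeholders; the flags and the default output file name come from parseArguments() below):

    python3 find_duplicated_files.py --verbose --threads 8 ~/Pictures
    # afterwards, as the script itself suggests:
    cat 'duplicated_files.txt' | xargs -d "\n" rm -r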
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib, os, sys, math, threading, time, argparse, datetime, sqlite3
# filePath is a relative path
# list.append() set.add()
class Hasher(threading.Thread):
    def __init__(self, lock, file_walker):
        threading.Thread.__init__(self)
        self.lock = lock
        self.file_walker = file_walker

    def md5(self, filePath, blocksize=128):
        # Hash the file in chunks so large files do not have to fit in memory.
        m = hashlib.md5()
        with open(os.path.abspath(filePath), "rb") as f:
            while True:
                buf = f.read(blocksize)
                if not buf:
                    break
                m.update(buf)
        return m.hexdigest()

    def run(self):
        # Each worker thread opens its own SQLite connection.
        self.db = DB()
        while True:
            self.lock.acquire()
            filePath = self.file_walker.popPath()  # returns False when no paths are left
            self.lock.release()
            if filePath:
                checksum, mtime = self.file_walker.getCalculatedChecksumFromDB(filePath)
                current_mtime = int(os.path.getmtime(filePath))
                if current_mtime == mtime:
                    # checksum in DB, and up-to-date
                    self.lock.acquire()
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
                elif checksum:
                    # checksum in DB, but out-of-date
                    checksum = self.md5(filePath)
                    self.lock.acquire()
                    self.db.update(filePath, checksum, current_mtime)
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
                else:
                    # checksum not in DB at all
                    checksum = self.md5(filePath)
                    self.lock.acquire()
                    self.db.insert(filePath, checksum, current_mtime)
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
            else:
                break
        # No more work: close the connection and let this thread finish.
        self.db.close()
        return 0
class DB:
    def __init__(self):
        # [FIXME] Should the DB live under the path given in sys.argv, or under a
        # fixed path such as ${HOME}? With a fixed path it would be convenient to
        # store absolute paths in the DB.
        self.conn = sqlite3.connect(DB_FILE_NAME)
        self.c = self.conn.cursor()
        # Create the table if it does not exist yet
        self.c.execute('''PRAGMA table_info(file_list)''')
        if len(self.c.fetchall()) == 0:
            self.c.execute('''CREATE TABLE file_list (
                file_path TEXT,
                md5sum VARCHAR(32),
                modified_at INTEGER
            )''')

    def insert(self, file_path, md5sum, modified_at):
        self.c.execute(
            '''INSERT
               INTO file_list (file_path, md5sum, modified_at)
               VALUES (?, ?, ?)''', (file_path, md5sum, modified_at))
        self.conn.commit()

    def update(self, file_path, md5sum, modified_at):
        self.c.execute(
            '''UPDATE file_list
               SET md5sum = ?,
                   modified_at = ?
               WHERE file_path = ?''', (md5sum, modified_at, file_path))
        self.conn.commit()

    def selectAll(self):
        # Returns [ (file_path, md5sum, modified_at), ... ]
        self.c.execute('''
            SELECT file_path, md5sum, modified_at
            FROM file_list''')
        return self.c.fetchall()

    def close(self):
        self.conn.close()
class FileWalker:
    def __init__(self, rootdir):
        self.checkFileExists(OUTPUT_FILE_NAME)
        # Variables
        self.allFiles = []
        self.duplicatedFiles = []
        self.seenChecksums = set()
        self.index = 0
        # Load previously calculated checksums from the DB
        db = DB()
        self.checksumsInDB = {path: [md5, mtime] for path, md5, mtime in db.selectAll()}
        db.close()
        # List all files recursively
        for root, subFolders, files in os.walk(rootdir):
            for file in files:
                if file not in [DB_FILE_NAME, OUTPUT_FILE_NAME]:
                    filePath = os.path.join(root, file)
                    self.allFiles.append(filePath)
        self.fileAmounts = len(self.allFiles)
        if self.fileAmounts == 0:
            print("No files found under this directory.")
            sys.exit()
        self.allFiles.sort()
        self.allFiles.reverse()
        # Hashing & threading
        lock = threading.Lock()
        print("Opening {} threads and calculating checksums...".format(THREADS_AMOUNT))
        for _ in range(0, THREADS_AMOUNT):
            Hasher(lock, self).start()
            time.sleep(0.1)
        # Wait until only the main thread is left, then print the summary.
        while True:
            if threading.active_count() == 1:
                self.printFinal()
                break
            else:
                time.sleep(0.1)
    def checkFileExists(self, filename):
        if os.path.exists(filename):
            if os.stat(filename).st_size > 5:
                c = input('File "{}" already exists, overwrite it? [y/n] '.format(filename))
                if len(c) == 0 or c[0] not in "Yy":
                    print("""Abort. You can use:
  `less '{}'` to see the content of the existing file.
  `cat '{}' | xargs -d "\\n" rm -r` to delete the paths listed in the file.""".format(
                        filename, filename))
                    sys.exit()
        self.__start_at = datetime.datetime.now().replace(microsecond=0)
    def printFinal(self):
        print("")
        print("Calculation completed (took {}).".format(
            datetime.datetime.now().replace(microsecond=0) - self.__start_at))
        if len(self.duplicatedFiles) == 0:
            print("No duplicated files found!")
        else:
            duplicatedFilePaths = [filePath for filePath, checksum in self.duplicatedFiles]
            with open(OUTPUT_FILE_NAME, 'w') as f:
                f.writelines(
                    "\n".join([os.path.abspath(os.path.expanduser(p)) for p in duplicatedFilePaths]))
            if VERBOSE:
                print("{} duplicated files found:".format(len(duplicatedFilePaths)))
                print("=" * 68)
                print("\n".join(duplicatedFilePaths))
                print("=" * 68)
                print()
                print("This list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
            else:
                print("{} duplicated files found.".format(len(duplicatedFilePaths)))
                print()
                print("Detailed list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
                print("""You can use `cat '{}' | xargs -d "\\n" rm -r` to delete them.""".format(OUTPUT_FILE_NAME))
    def popPath(self):
        # Return the next (relative) path for a hasher; return False when exhausted.
        try:
            filePath = self.allFiles.pop()
            self.index += 1
            return filePath
        except IndexError:
            return False

    def getCalculatedChecksumFromDB(self, filePath):
        """Return (md5, mtime), or (None, None) if the path is not in the DB."""
        data = self.checksumsInDB.get(filePath)
        if data is None:
            return None, None
        return data[0], data[1]

    def handleFile(self, filePath, checksum, mtime, *,
                   justRecalculated=False, thisIsExactlyDuplicated=False):
        """Record filePath and checksum in self.duplicatedFiles if the checksum
        has been seen before. This does not touch the DB at all."""
        if (thisIsExactlyDuplicated or
                checksum in self.seenChecksums):
            self.duplicatedFiles.append((filePath, checksum))
        else:
            self.seenChecksums.add(checksum)
        # Add/update the in-memory checksum cache
        self.checksumsInDB[filePath] = [checksum, mtime]
        self.printProgress(filePath, checksum)

    def printProgress(self, filePath, checksum):
        print(self.percentage(),
              checksum, filePath,
              end=" \r")

    def percentage(self):
        return "[{}%] {}/{}".format(math.ceil((self.index / self.fileAmounts) * 100),
                                    self.index,
                                    self.fileAmounts)
def parseArguments():
    _parser = argparse.ArgumentParser(
        prog="find_duplicated_files",
        description='Find duplicated files via MD5 checksum and output the list as a file.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _parser.add_argument("-v", "--verbose", action='store_true',  # 'count' works too
                         help='print all duplicated files on screen when finished.')
    _parser.add_argument("-o", "--output", nargs="?", metavar="FILE_NAME", default='duplicated_files.txt',
                         help='specify the output filename')
    _parser.add_argument("-d", "--db", nargs="?", metavar="DB_FILE_NAME", default='.checksums.db',
                         help="specify the DB file name for storing checksums")
    _parser.add_argument("-t", "--threads", nargs="?", metavar="NUMBER", type=int, default=4,
                         help='specify the number of threads.')
    _parser.add_argument("path", metavar="DIRECTORY_PATH")
    return _parser.parse_args()
def main():
    args = parseArguments()
    global OUTPUT_FILE_NAME, THREADS_AMOUNT, VERBOSE, DB_FILE_NAME
    OUTPUT_FILE_NAME = args.output
    THREADS_AMOUNT = args.threads
    VERBOSE = args.verbose
    DB_FILE_NAME = args.db
    FileWalker(args.path)


if __name__ == "__main__":
    main()
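As a side note, the checksum cache kept by the script can be inspected directly with Python's sqlite3 module. This is a minimal sketch, assuming the default cache file name .checksums.db in the current working directory (the query matches the file_list schema created by the DB class above):

    import sqlite3

    # Default value of --db; adjust the path if you passed -d or ran the script elsewhere.
    conn = sqlite3.connect(".checksums.db")
    for file_path, md5sum, modified_at in conn.execute(
            "SELECT file_path, md5sum, modified_at FROM file_list"):
        print(md5sum, modified_at, file_path)
    conn.close()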