Last active
August 29, 2015 14:24
-
-
Save kuanyui/22aaea4569ce009c3718 to your computer and use it in GitHub Desktop.
Find duplicated files via MD5 checksum and output the list as file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import hashlib, os, sys, math, threading, time, argparse, datetime, sqlite3 | |
# filePath is relative path | |
# list.append() set.add() | |
class Hasher(threading.Thread): | |
def __init__(self, lock, file_walker): | |
threading.Thread.__init__(self) | |
self.lock = lock | |
self.file_walker = file_walker | |
def md5(self, filePath, blocksize=128): | |
m = hashlib.md5() | |
with open(os.path.abspath(filePath), "rb" ) as f: | |
while True: | |
buf = f.read(blocksize) | |
if not buf: | |
break | |
m.update(buf) | |
return m.hexdigest() | |
def run(self): | |
# Create SQLite obj | |
self.db = DB() | |
while True: | |
self.lock.acquire() | |
filePath = self.file_walker.popPath() # 沒了會回傳False | |
self.lock.release() | |
if filePath: | |
#checksum = hashlib.md5(open( | |
# os.path.abspath(os.path.expanduser(filePath)), "rb").read()).hexdigest) | |
checksum, mtime = self.file_walker.getCalculatedChecksumFromDB(filePath) | |
current_mtime = int(os.path.getmtime(filePath)) | |
if current_mtime == mtime: | |
# checksum in DB, and is updated | |
self.lock.acquire() | |
self.file_walker.handleFile(filePath, checksum, current_mtime) | |
self.lock.release() | |
elif checksum: | |
# checksum in DB, but is out-of-date | |
checksum = self.md5(filePath) | |
self.lock.acquire() | |
self.db.update(filePath, checksum, current_mtime) | |
self.file_walker.handleFile(filePath, checksum, current_mtime) | |
self.lock.release() | |
else: | |
# checksum not in DB at all. | |
checksum = self.md5(filePath) | |
self.lock.acquire() | |
self.db.insert(filePath, checksum, current_mtime) | |
self.file_walker.handleFile(filePath, checksum, current_mtime) | |
self.lock.release() | |
else: | |
break | |
# close this thread | |
self.db.close() | |
return 0 | |
class DB:
    """Thin wrapper around the SQLite cache mapping each file path to its
    (md5sum, mtime), so unchanged files are not re-hashed on later runs.

    Connects to the global DB_FILE_NAME.  Create one instance per thread;
    sqlite3 connections are not shareable across threads by default.
    """

    def __init__(self):
        # [FIXME] Should the DB live under the path given in sys.argv, or at a
        # fixed location like ${HOME}?  A fixed location would let us store
        # absolute paths in the DB.
        self.conn = sqlite3.connect(DB_FILE_NAME)
        self.c = self.conn.cursor()
        # IF NOT EXISTS replaces the old PRAGMA table_info() existence probe.
        self.c.execute('''CREATE TABLE IF NOT EXISTS file_list (
        file_path TEXT,
        md5sum VARCHAR(32),
        modified_at INTEGER
        )''')

    def insert(self, file_path, md5sum, modified_at):
        """Add a new cache row (parameterized — no SQL string building)."""
        self.c.execute(
            '''INSERT
               INTO file_list (file_path, md5sum, modified_at)
               VALUES (?, ?, ?)''', (file_path, md5sum, modified_at))
        self.conn.commit()

    def update(self, file_path, md5sum, modified_at):
        """Refresh the checksum and mtime for an existing file_path row."""
        self.c.execute(
            '''UPDATE file_list
               SET md5sum = ?,
                   modified_at = ?
               WHERE file_path = ?''', (md5sum, modified_at, file_path))
        self.conn.commit()

    def selectAll(self):
        """Return every cached row as [(file_path, md5sum, modified_at), ...]."""
        self.c.execute('''
        SELECT file_path, md5sum, modified_at
        FROM file_list ''')
        return self.c.fetchall()

    def close(self):
        """Close the underlying connection (uncommitted work is discarded)."""
        self.conn.close()
class FileWalker:
    """Recursively collects files under rootdir, fans them out to Hasher
    worker threads, gathers duplicate hits, and writes the report file."""

    def __init__(self, rootdir):
        self.checkFileExists(OUTPUT_FILE_NAME)
        # Variables
        self.allFiles = []          # pending (relative) paths; popPath() consumes from the tail
        self.duplicatedFiles = []   # [(filePath, checksum), ...]
        self.seenChecksums = set()  # checksums observed at least once this run
        self.index = 0              # number of files handed out so far
        # Preload cached checksums from the DB: {path: [md5, mtime]}
        db = DB()
        self.checksumsInDB = {path: [md5, mtime] for path, md5, mtime in db.selectAll()}
        db.close()  # the original leaked this connection
        # List all files recursively, skipping our own DB and output files.
        for root, _subFolders, filenames in os.walk(rootdir):
            for name in filenames:
                if name not in (DB_FILE_NAME, OUTPUT_FILE_NAME):
                    self.allFiles.append(os.path.join(root, name))
        self.fileAmounts = len(self.allFiles)
        if self.fileAmounts == 0:
            print("這個目錄下找不到任何檔案")
            sys.exit()
        # Sort then reverse so tail-pops yield paths in ascending order.
        self.allFiles.sort()
        self.allFiles.reverse()
        # Hash & Threading
        lock = threading.Lock()
        print("Open {} threads and calculating checksum...".format(THREADS_AMOUNT))
        workers = []
        for _ in range(THREADS_AMOUNT):
            worker = Hasher(lock, self)
            worker.start()
            workers.append(worker)
        # join() replaces the original busy-wait on threading.active_count(),
        # which burned CPU and could be confused by unrelated threads.
        for worker in workers:
            worker.join()
        self.printFinal()

    def checkFileExists(self, filename):
        """Ask before clobbering a non-trivial existing output file; abort
        unless the user answers yes.  Also records the start timestamp used
        later by printFinal() for the duration report."""
        if os.path.exists(filename):
            # st_size > 5 treats near-empty leftovers as safe to overwrite.
            if os.stat(filename).st_size > 5:
                c = input('File "{}" existed, overwrite it? [y/n]'.format(filename))
                if len(c) == 0 or c[0] not in "Yy":
                    print("""Abort. You can use:
`less '{}'` to see the content of the existed file.
`cat '{}' | xargs -d "\\n" rm -r` to delete the paths listed in the file.""".format(
    filename, filename))
                    sys.exit()
        self.__start_at = datetime.datetime.now().replace(microsecond=0)

    def printFinal(self):
        """Print the summary and save the duplicate list to OUTPUT_FILE_NAME."""
        print("")
        print("Calculation completed (cost {} seconds).".format(
            datetime.datetime.now().replace(microsecond=0) - self.__start_at))
        if len(self.duplicatedFiles) == 0:
            print("沒有發現重複的檔案喔~~~ No duplicated files found!")
            return
        duplicatedFilePaths = [filePath for filePath, checksum in self.duplicatedFiles]
        with open(OUTPUT_FILE_NAME, 'w') as f:
            f.write("\n".join(os.path.abspath(os.path.expanduser(p))
                              for p in duplicatedFilePaths))
        if VERBOSE:
            print("{} duplicated files found:".format(len(duplicatedFilePaths)))
            print("=" * 68)
            print("\n".join(duplicatedFilePaths))
            print("=" * 68)
            print()
            print("This list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
        else:
            print("{} duplicated files found.".format(len(duplicatedFilePaths)))
            print()
            print("Detailed list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
        print("""You can use `cat '{}' | xargs -d "\\n" rm -r` to delete them.""".format(OUTPUT_FILE_NAME))

    def popPath(self):
        """Hand the next pending (relative) path to a hasher thread.
        Returns False when nothing is left.  Caller must hold the lock."""
        try:
            filePath = self.allFiles.pop()
            self.index += 1
            return filePath
        except IndexError:
            return False

    def getCalculatedChecksumFromDB(self, filePath):
        """Return the cached (md5, mtime) for filePath, or (None, None)."""
        data = self.checksumsInDB.get(filePath)
        if data is None:
            return None, None
        return data[0], data[1]

    def handleFile(self, filePath, checksum, mtime, *,
                   justRecalculated=False, thisIsExactlyDuplicated=False):
        """Record filePath as a duplicate when its checksum was already seen
        (or the caller says so); otherwise remember the checksum and refresh
        the in-memory cache.  Not DB's business — callers persist separately.
        Caller must hold the shared lock.  justRecalculated is accepted for
        interface compatibility but currently unused."""
        if (thisIsExactlyDuplicated or
            checksum in self.seenChecksums):
            self.duplicatedFiles.append((filePath, checksum))
        else:
            self.seenChecksums.add(checksum)
            self.checksumsInDB[filePath] = [checksum, mtime]
        self.printProgress(filePath, checksum)

    def printProgress(self, filePath, checksum):
        """Overwrite the current console line with progress + latest file."""
        print(self.percentage(),
              checksum, filePath,
              end="    \r")

    def percentage(self):
        """Return a '[NN%] done/total' progress string (ceil percentage)."""
        return "[{}%] {}/{}".format(math.ceil((self.index / self.fileAmounts) * 100),
                                    self.index,
                                    self.fileAmounts)
def parseArguments():
    """Build the CLI and return the parsed argparse.Namespace.

    Fix: the value-taking options previously declared nargs="?", so a bare
    `-o` / `-d` / `-t` silently stored None and crashed later (e.g.
    range(0, None)); the options now require their argument.
    """
    _parser = argparse.ArgumentParser(
        prog="find_duplicated_files",
        description='Find duplicated files via MD5 checksum and output the list as file.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _parser.add_argument("-v", "--verbose", action='store_true',  # 'count' is ok, too
                         help='print all duplicated files on screen when finishing.')
    _parser.add_argument("-o", "--output", metavar="FILE NAME", default='duplicated_files.txt',
                         help='specify output filename')
    _parser.add_argument("-d", "--db", metavar="DB FILE NAME", default='.checksums.db',
                         help="specify the DB file's name for storing checksum")
    _parser.add_argument("-t", "--threads", metavar="NUMBER", type=int, default=4,
                         help='specify thread amounts.')
    _parser.add_argument("path", metavar="DIRECTORY_PATH")
    return _parser.parse_args()
def main():
    """CLI entry point: publish the parsed options as module-level globals
    (read by DB, Hasher and FileWalker), then kick off the walk."""
    global OUTPUT_FILE_NAME, THREADS_AMOUNT, VERBOSE, DB_FILE_NAME
    options = parseArguments()
    OUTPUT_FILE_NAME = options.output
    THREADS_AMOUNT = options.threads
    VERBOSE = options.verbose
    DB_FILE_NAME = options.db
    FileWalker(options.path)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment