@kuanyui
Last active August 29, 2015 14:24
Find duplicated files via MD5 checksum and output the list as a file.
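For reference, a typical invocation might look like the following (the script filename and the target directory are placeholders; the flags and the default output file name come from parseArguments() below):

    python3 find_duplicated_files.py --verbose --threads 8 ~/Pictures
    # afterwards, as the script itself suggests:
    cat 'duplicated_files.txt' | xargs -d "\n" rm -r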
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib, os, sys, math, threading, time, argparse, datetime, sqlite3
# filePath is a relative path
# list.append() set.add()
class Hasher(threading.Thread):
    def __init__(self, lock, file_walker):
        threading.Thread.__init__(self)
        self.lock = lock
        self.file_walker = file_walker

    def md5(self, filePath, blocksize=128):
        # Hash the file in chunks so large files do not have to fit in memory.
        m = hashlib.md5()
        with open(os.path.abspath(filePath), "rb") as f:
            while True:
                buf = f.read(blocksize)
                if not buf:
                    break
                m.update(buf)
        return m.hexdigest()

    def run(self):
        # Each worker thread opens its own SQLite connection.
        self.db = DB()
        while True:
            self.lock.acquire()
            filePath = self.file_walker.popPath()  # returns False when no paths are left
            self.lock.release()
            if filePath:
                checksum, mtime = self.file_walker.getCalculatedChecksumFromDB(filePath)
                current_mtime = int(os.path.getmtime(filePath))
                if current_mtime == mtime:
                    # checksum in DB, and up-to-date
                    self.lock.acquire()
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
                elif checksum:
                    # checksum in DB, but out-of-date
                    checksum = self.md5(filePath)
                    self.lock.acquire()
                    self.db.update(filePath, checksum, current_mtime)
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
                else:
                    # checksum not in DB at all
                    checksum = self.md5(filePath)
                    self.lock.acquire()
                    self.db.insert(filePath, checksum, current_mtime)
                    self.file_walker.handleFile(filePath, checksum, current_mtime)
                    self.lock.release()
            else:
                break
        # No more work: close the connection and let this thread finish.
        self.db.close()
        return 0
class DB:
    def __init__(self):
        # [FIXME] Should the DB live under the path given in sys.argv, or under a
        # fixed path such as ${HOME}? With a fixed path it would be convenient to
        # store absolute paths in the DB.
        self.conn = sqlite3.connect(DB_FILE_NAME)
        self.c = self.conn.cursor()
        # Create the table if it does not exist yet
        self.c.execute('''PRAGMA table_info(file_list)''')
        if len(self.c.fetchall()) == 0:
            self.c.execute('''CREATE TABLE file_list (
                file_path TEXT,
                md5sum VARCHAR(32),
                modified_at INTEGER
            )''')

    def insert(self, file_path, md5sum, modified_at):
        self.c.execute(
            '''INSERT
               INTO file_list (file_path, md5sum, modified_at)
               VALUES (?, ?, ?)''', (file_path, md5sum, modified_at))
        self.conn.commit()

    def update(self, file_path, md5sum, modified_at):
        self.c.execute(
            '''UPDATE file_list
               SET md5sum = ?,
                   modified_at = ?
               WHERE file_path = ?''', (md5sum, modified_at, file_path))
        self.conn.commit()

    def selectAll(self):
        # Returns [ (file_path, md5sum, modified_at), ... ]
        self.c.execute('''
            SELECT file_path, md5sum, modified_at
            FROM file_list''')
        return self.c.fetchall()

    def close(self):
        self.conn.close()
class FileWalker:
    def __init__(self, rootdir):
        self.checkFileExists(OUTPUT_FILE_NAME)
        # Variables
        self.allFiles = []
        self.duplicatedFiles = []
        self.seenChecksums = set()
        self.index = 0
        # Load previously calculated checksums from the DB
        db = DB()
        self.checksumsInDB = {path: [md5, mtime] for path, md5, mtime in db.selectAll()}
        db.close()
        # List all files recursively
        for root, subFolders, files in os.walk(rootdir):
            for file in files:
                if file not in [DB_FILE_NAME, OUTPUT_FILE_NAME]:
                    filePath = os.path.join(root, file)
                    self.allFiles.append(filePath)
        self.fileAmounts = len(self.allFiles)
        if self.fileAmounts == 0:
            print("No files found under this directory.")
            sys.exit()
        self.allFiles.sort()
        self.allFiles.reverse()
        # Hashing & threading
        lock = threading.Lock()
        print("Opening {} threads and calculating checksums...".format(THREADS_AMOUNT))
        for _ in range(0, THREADS_AMOUNT):
            Hasher(lock, self).start()
            time.sleep(0.1)
        # Wait until only the main thread is left, then print the summary.
        while True:
            if threading.active_count() == 1:
                self.printFinal()
                break
            else:
                time.sleep(0.1)
    def checkFileExists(self, filename):
        if os.path.exists(filename):
            if os.stat(filename).st_size > 5:
                c = input('File "{}" already exists, overwrite it? [y/n] '.format(filename))
                if len(c) == 0 or c[0] not in "Yy":
                    print("""Abort. You can use:
  `less '{}'` to see the content of the existing file.
  `cat '{}' | xargs -d "\\n" rm -r` to delete the paths listed in the file.""".format(
                        filename, filename))
                    sys.exit()
        self.__start_at = datetime.datetime.now().replace(microsecond=0)
    def printFinal(self):
        print("")
        print("Calculation completed (took {}).".format(
            datetime.datetime.now().replace(microsecond=0) - self.__start_at))
        if len(self.duplicatedFiles) == 0:
            print("No duplicated files found!")
        else:
            duplicatedFilePaths = [filePath for filePath, checksum in self.duplicatedFiles]
            with open(OUTPUT_FILE_NAME, 'w') as f:
                f.writelines(
                    "\n".join([os.path.abspath(os.path.expanduser(p)) for p in duplicatedFilePaths]))
            if VERBOSE:
                print("{} duplicated files found:".format(len(duplicatedFilePaths)))
                print("=" * 68)
                print("\n".join(duplicatedFilePaths))
                print("=" * 68)
                print()
                print("This list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
            else:
                print("{} duplicated files found.".format(len(duplicatedFilePaths)))
                print()
                print("Detailed list has been saved in \"{}\"".format(OUTPUT_FILE_NAME))
                print("""You can use `cat '{}' | xargs -d "\\n" rm -r` to delete them.""".format(OUTPUT_FILE_NAME))
    def popPath(self):
        # Return the next (relative) path for a hasher; return False when exhausted.
        try:
            filePath = self.allFiles.pop()
            self.index += 1
            return filePath
        except IndexError:
            return False

    def getCalculatedChecksumFromDB(self, filePath):
        """Return (md5, mtime), or (None, None) if the path is not in the DB."""
        data = self.checksumsInDB.get(filePath)
        if data is None:
            return None, None
        return data[0], data[1]

    def handleFile(self, filePath, checksum, mtime, *,
                   justRecalculated=False, thisIsExactlyDuplicated=False):
        """Record filePath and checksum in self.duplicatedFiles if the checksum
        has been seen before. This does not touch the DB at all."""
        if (thisIsExactlyDuplicated or
                checksum in self.seenChecksums):
            self.duplicatedFiles.append((filePath, checksum))
        else:
            self.seenChecksums.add(checksum)
        # Add/update the in-memory checksum cache
        self.checksumsInDB[filePath] = [checksum, mtime]
        self.printProgress(filePath, checksum)

    def printProgress(self, filePath, checksum):
        print(self.percentage(),
              checksum, filePath,
              end=" \r")

    def percentage(self):
        return "[{}%] {}/{}".format(math.ceil((self.index / self.fileAmounts) * 100),
                                    self.index,
                                    self.fileAmounts)
def parseArguments():
    _parser = argparse.ArgumentParser(
        prog="find_duplicated_files",
        description='Find duplicated files via MD5 checksum and output the list as a file.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _parser.add_argument("-v", "--verbose", action='store_true',  # 'count' works too
                         help='print all duplicated files on screen when finished.')
    _parser.add_argument("-o", "--output", nargs="?", metavar="FILE_NAME", default='duplicated_files.txt',
                         help='specify the output filename')
    _parser.add_argument("-d", "--db", nargs="?", metavar="DB_FILE_NAME", default='.checksums.db',
                         help="specify the DB file name for storing checksums")
    _parser.add_argument("-t", "--threads", nargs="?", metavar="NUMBER", type=int, default=4,
                         help='specify the number of threads.')
    _parser.add_argument("path", metavar="DIRECTORY_PATH")
    return _parser.parse_args()
def main():
    args = parseArguments()
    global OUTPUT_FILE_NAME, THREADS_AMOUNT, VERBOSE, DB_FILE_NAME
    OUTPUT_FILE_NAME = args.output
    THREADS_AMOUNT = args.threads
    VERBOSE = args.verbose
    DB_FILE_NAME = args.db
    FileWalker(args.path)


if __name__ == "__main__":
    main()
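As a side note, the checksum cache kept by the script can be inspected directly with Python's sqlite3 module. This is a minimal sketch, assuming the default cache file name .checksums.db in the current working directory (the query matches the file_list schema created by the DB class above):

    import sqlite3

    # Default value of --db; adjust the path if you passed -d or ran the script elsewhere.
    conn = sqlite3.connect(".checksums.db")
    for file_path, md5sum, modified_at in conn.execute(
            "SELECT file_path, md5sum, modified_at FROM file_list"):
        print(md5sum, modified_at, file_path)
    conn.close()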