Remove duplicate files with a hash check
import hashlib
import os
import re

# Set the working directory to scan.
DIR = "/Volumes/Desktop"


def calculate_hash(filepath):
    """Return the SHA-256 hex digest of a file, read in 64 KB chunks."""
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as f:
        while True:
            data = f.read(65536)  # read in 64 KB chunks
            if not data:
                break
            hasher.update(data)
    return hasher.hexdigest()


# Find duplicate files named like "file (1).jpg" or "file (2).png" and,
# before deleting, verify via hash that they are identical to the original.
def remove_duplicates(root_dir):
    pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            match = pattern.match(filename)
            if match:
                # Reconstruct the presumed original name, e.g. "file (1).jpg" -> "file.jpg".
                original_file = match.group(1) + (match.group(2) or "")
                original_filepath = os.path.join(dirpath, original_file)
                duplicate_filepath = os.path.join(dirpath, filename)
                if os.path.exists(original_filepath):
                    original_hash = calculate_hash(original_filepath)
                    duplicate_hash = calculate_hash(duplicate_filepath)
                    # Only remove the copy if its contents match the original exactly.
                    if original_hash == duplicate_hash:
                        os.remove(duplicate_filepath)
                        print(f"Removed {duplicate_filepath}")


remove_duplicates(DIR)
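
Since the script deletes files, it can help to sanity-check the filename pattern before pointing it at a real directory. A minimal sketch, using the same regex as the script; the sample filenames below are hypothetical, not from the gist:

import re

pattern = re.compile(r"(.*) \(\d+\)(\..*)?$")

# Hypothetical filenames to illustrate what the pattern does and does not match.
for name in ["photo (1).jpg", "report (2).pdf", "notes (3)", "photo.jpg"]:
    m = pattern.match(name)
    if m:
        original = m.group(1) + (m.group(2) or "")
        print(f"{name!r} -> original candidate {original!r}")
    else:
        print(f"{name!r} -> not a duplicate-style name")

Here "photo (1).jpg" resolves to "photo.jpg", "notes (3)" (no extension) resolves to "notes", and "photo.jpg" is left alone because it lacks the " (N)" suffix.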