Skip to content

Instantly share code, notes, and snippets.

Created February 27, 2024 17:47
Show Gist options
  • Save platomav/2b1a5455a7b727626d528ae5e1933b76 to your computer and use it in GitHub Desktop.
Save platomav/2b1a5455a7b727626d528ae5e1933b76 to your computer and use it in GitHub Desktop.
Duplicate text file finder (and optionally deleter) that compares decoded content regardless of file encoding, for POSIX/NT, written in Python 3
#!/usr/bin/env python3
# coding=utf-8
import hashlib
import os
import json
from argparse import ArgumentParser, Namespace
from chardet.universaldetector import UniversalDetector as ChardetDetector
from charset_normalizer import detect as charset_detector
from tqdm import tqdm
def _delete_path(input_path: str) -> None:
    """Delete the file or symlink at *input_path* and print the outcome.

    Broken symlinks are accepted as deletable. Prints ``[Deleted]`` when the
    removal succeeds and ``[Error]`` when the path is not a deletable file or
    when the operating system refuses the removal.

    Args:
        input_path: Path of the file or symlink to remove.
    """
    if not _is_valid_path(input_path=input_path, allow_broken_links=True):
        print(f' {input_path} [Error]')
        return

    try:
        os.remove(input_path)
    except OSError:
        # Permission problems, races with other processes, etc. are reported
        # rather than aborting the whole duplicate scan.
        print(f' {input_path} [Error]')
    else:
        print(f' {input_path} [Deleted]')
def _is_valid_path(input_path: str, allow_broken_links: bool = False) -> bool:
    """Return whether *input_path* refers to an existing non-directory file.

    Args:
        input_path: Path to check; it is made absolute before testing.
        allow_broken_links: When true, a symlink whose target is missing is
            also treated as valid.

    Returns:
        ``True`` for regular files (including symlinks that resolve to one)
        and, if *allow_broken_links* is set, for dangling symlinks.
        ``False`` for missing paths and for directories.
    """
    input_path_abs: str = os.path.abspath(input_path)

    # lexists() does not follow symlinks, so dangling links get past this
    # guard; isdir() does follow them, so links to directories are rejected.
    if not os.path.lexists(input_path_abs) or os.path.isdir(input_path_abs):
        return False

    if os.path.isfile(input_path_abs):
        return True

    return allow_broken_links and os.path.islink(input_path_abs)
def _get_path_files(check_paths: list) -> list[str]:
    """Expand *check_paths* into a flat list of absolute file paths.

    Directories are walked recursively without following symlinked
    directories. Any other path is kept if it is an existing file or a
    (possibly broken) symlink; everything else is ignored.

    Args:
        check_paths: File and/or directory paths to expand.

    Returns:
        Absolute paths of all files found, in discovery order.
    """
    path_files: list[str] = []

    for check_path in check_paths:
        check_path_abs: str = os.path.abspath(check_path)

        if os.path.isdir(check_path_abs):
            # noinspection PyArgumentEqualDefault
            for root_path, _, file_names in os.walk(check_path_abs, followlinks=False):
                path_files.extend(os.path.join(root_path, file_name) for file_name in file_names)
        elif _is_valid_path(input_path=check_path_abs, allow_broken_links=True):
            path_files.append(check_path_abs)

    return path_files
def _get_encoding_chardet(in_path: str) -> str | None:
    """Detect the text encoding of *in_path* using chardet.

    The file is fed to chardet's incremental detector line by line, and
    reading stops as soon as the detector reports that it is done.

    Args:
        in_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the detector result, which may be
        ``None`` when chardet reaches no conclusion.
    """
    chardet_detector: ChardetDetector = ChardetDetector()

    with open(in_path, 'rb') as in_file:
        for in_line in in_file:
            chardet_detector.feed(in_line)

            if chardet_detector.done:
                break

    # close() finalises the result when the detector did not finish early.
    chardet_detector.close()

    return chardet_detector.result['encoding']
def _get_encoding_charset(in_path: str) -> str | None:
    """Detect the text encoding of *in_path* using charset_normalizer.

    Args:
        in_path: Path of the file to inspect; it is read fully into memory.

    Returns:
        The ``'encoding'`` entry of the detection result, which may be
        ``None``.
    """
    with open(in_path, 'rb') as in_file:
        in_buffer: bytes = in_file.read()

    return charset_detector(in_buffer)['encoding']
def _detect_encoding(in_path: str) -> str:
    """Pick the most plausible encoding of *in_path* from two detectors.

    chardet and charset_normalizer are both consulted. When they agree, or
    when only one gives an answer, that answer is used, with ``'utf-8'`` as
    the last resort. When they disagree, the tie is broken in this order:

    1. a BOM-signature encoding (name contains ``-sig``), chardet first;
    2. any UTF-8 variant, chardet first;
    3. charset_normalizer's answer for the (chardet, charset_normalizer)
       pairs listed in ``CHARSET_PREFER``;
    4. chardet's answer otherwise.

    Args:
        in_path: Path of the file to inspect.

    Returns:
        The chosen encoding name.
    """
    chardet_result: str | None = _get_encoding_chardet(in_path)
    charset_result: str | None = _get_encoding_charset(in_path)

    # No conflict to resolve: same answer, or at most one usable answer.
    if chardet_result == charset_result or not chardet_result or not charset_result:
        return chardet_result or charset_result or 'utf-8'

    # (chardet, charset_normalizer) pairs for which charset_normalizer wins.
    CHARSET_PREFER: list[tuple] = [
        ('cp949', 'windows-1250'),
        ('euc-jp', 'big5'),
        ('macroman', 'cp932'),
        ('macroman', 'windows-1256'),
    ]

    # Normalise spelling so e.g. 'UTF_8' and 'utf-8' compare equal.
    chardet_result_norm: str = chardet_result.lower().replace('_', '-')
    charset_result_norm: str = charset_result.lower().replace('_', '-')

    # Signature encodings outrank plain UTF-8; within each, chardet goes first.
    for marker in ('-sig', 'utf-8'):
        if marker in chardet_result_norm:
            return chardet_result

        if marker in charset_result_norm:
            return charset_result

    if (chardet_result_norm, charset_result_norm) in CHARSET_PREFER:
        return charset_result

    return chardet_result
def check_duplicates(in_paths: list, is_delete: bool) -> None:
    """Report, and optionally delete, files whose decoded text is identical.

    Each file is decoded with its detected encoding (undecodable bytes are
    ignored), re-encoded as UTF-8 and hashed with SHA-1. Files that share a
    hash are printed as one group. Within a group the path that sorts first
    by (length, name) is labelled the original and the rest are duplicates.

    Args:
        in_paths: File and/or directory paths to scan.
        is_delete: When true, duplicates are deleted instead of just listed.
    """
    file_text_hashes: dict[str, set] = {}

    for file_path in tqdm(_get_path_files(in_paths)):
        with open(file_path, 'r', encoding=_detect_encoding(file_path), errors='ignore') as file_pointer:
            file_text: str = file_pointer.read()

        # Hash the decoded text, not the raw bytes, so the same content
        # stored in different encodings lands in the same group.
        file_text_hash: str = hashlib.sha1(file_text.encode('utf-8')).hexdigest()

        file_text_hashes.setdefault(file_text_hash, set()).add(file_path)

    for file_text_hash, file_paths in file_text_hashes.items():
        file_paths_count: int = len(file_paths)

        if file_paths_count <= 1:
            continue

        print(f'\nFound {file_paths_count} files with text hash {file_text_hash}')

        # Shortest path first (ties broken alphabetically) is kept as original.
        for file_index, file_path in enumerate(sorted(file_paths, key=lambda fp: (len(fp), fp))):
            if file_index == 0:
                print(f' {file_path} [Original]')
            elif is_delete:
                _delete_path(file_path)
            else:
                print(f' {file_path} [Duplicate]')
if __name__ == "__main__":
    # Command-line entry point: scan the given paths for duplicate text files.
    parser: ArgumentParser = ArgumentParser()

    parser.add_argument('paths', nargs='*')
    parser.add_argument('-d', '--delete', help='delete duplicate files', action='store_true')

    arguments: Namespace = parser.parse_args()

    # With no paths given there is nothing to scan, so the script exits quietly.
    if arguments.paths:
        check_duplicates(in_paths=arguments.paths, is_delete=arguments.delete)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment