Created
February 27, 2024 17:47
-
-
Save platomav/2b1a5455a7b727626d528ae5e1933b76 to your computer and use it in GitHub Desktop.
Duplicate text-file finder (and optional deleter) using encoding-aware content hashing, for POSIX/NT, written in Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding=utf-8 | |
import hashlib | |
import os | |
import json | |
from argparse import ArgumentParser, Namespace | |
from chardet.universaldetector import UniversalDetector as ChardetDetector | |
from charset_normalizer import detect as charset_detector | |
from tqdm import tqdm | |
def _delete_path(input_path: str) -> None:
    """Delete a single file (or symlink, even a broken one) and report the outcome.

    Paths that are not deletable files/links (directories, missing paths)
    are reported as errors and left untouched.
    """
    if not _is_valid_path(input_path=input_path, allow_broken_links=True):
        print(f' {input_path} [Error]')
        return
    os.remove(input_path)
    print(f' {input_path} [Deleted]')
def _is_valid_path(input_path: str, allow_broken_links: bool = False) -> bool:
    """Return True when *input_path* points at a regular file.

    With allow_broken_links=True, symlinks (including dangling ones)
    are also accepted. Directories and non-existent paths are rejected.
    """
    abs_path: str = os.path.abspath(input_path)
    # lexists() does not follow symlinks, so a dangling link still counts as present.
    if not os.path.lexists(abs_path) or os.path.isdir(abs_path):
        return False
    if allow_broken_links and os.path.islink(abs_path):
        return True
    return os.path.isfile(abs_path)
def _get_path_files(check_paths: list) -> list[str]:
    """Expand a mix of file and directory paths into a flat list of file paths.

    Directories are walked recursively without following symlinked
    subdirectories; non-directory arguments are kept when they are valid
    files or links (broken links included).
    """
    collected: list[str] = []
    for raw_path in check_paths:
        abs_path: str = os.path.abspath(raw_path)
        if os.path.isdir(abs_path):
            # os.walk defaults to followlinks=False, so link cycles are safe.
            for dir_path, _, file_names in os.walk(abs_path):
                collected.extend(os.path.join(dir_path, file_name) for file_name in file_names)
        elif _is_valid_path(input_path=abs_path, allow_broken_links=True):
            collected.append(abs_path)
    return collected
def _get_encoding_chardet(in_path: str) -> str | None:
    """Detect the text encoding of a file with chardet's incremental detector.

    Streams the file line by line instead of materializing every line in
    memory, and stops feeding as soon as the detector reports it is done
    (chardet's documented incremental-usage pattern) — the result is the
    same, but large files are no longer read in full.

    Returns the detected encoding name, or None when chardet cannot decide.
    """
    chardet_detector: ChardetDetector = ChardetDetector()
    with open(in_path, 'rb') as in_file:
        for in_line in in_file:
            chardet_detector.feed(in_line)
            if chardet_detector.done:  # confident answer reached; stop early
                break
    chardet_detector.close()
    return chardet_detector.result['encoding']
def _get_encoding_charset(in_path: str) -> str | None:
    """Detect the text encoding of a file with charset-normalizer.

    Returns the detected encoding name, or None when detection fails.
    """
    with open(in_path, 'rb') as in_file:
        raw_bytes: bytes = in_file.read()
    detection: dict = charset_detector(raw_bytes)
    return detection['encoding']
def _detect_encoding(in_path: str) -> str:
    """Choose a text encoding for *in_path* by reconciling two detectors.

    When chardet and charset-normalizer agree (or one fails), the available
    answer is used, falling back to 'utf-8' when both fail. When they
    disagree, BOM-signed ('-sig') answers win first, then UTF-8 answers,
    then a small table of known chardet misdetections where
    charset-normalizer is preferred; otherwise chardet's answer is kept.
    """
    chardet_result: str | None = _get_encoding_chardet(in_path)
    charset_result: str | None = _get_encoding_charset(in_path)
    # Easy cases: agreement, or at least one detector failed.
    if not (chardet_result and charset_result) or chardet_result == charset_result:
        return chardet_result or charset_result or 'utf-8'
    # Pairs of (chardet answer, charset-normalizer answer) where the
    # charset-normalizer answer is the trustworthy one.
    charset_preferred: list[tuple] = [
        ('cp949', 'windows-1250'),
        ('euc-jp', 'big5'),
        ('macroman', 'cp932'),
        ('macroman', 'windows-1256')
    ]
    chardet_norm: str = chardet_result.lower().replace('_', '-')
    charset_norm: str = charset_result.lower().replace('_', '-')
    # BOM-signed answers outrank UTF-8 answers; chardet outranks
    # charset-normalizer within each tier (same order as the original chain).
    for marker in ('-sig', 'utf-8'):
        if marker in chardet_norm:
            return chardet_result
        if marker in charset_norm:
            return charset_result
    if (chardet_norm, charset_norm) in charset_preferred:
        return charset_result
    return chardet_result
def check_duplicates(in_paths: list, is_delete: bool) -> None:
    """Find files with identical decoded text and report (or delete) duplicates.

    Every file under *in_paths* is decoded with its detected encoding
    (undecodable bytes ignored), its text is SHA-1 hashed, and files are
    bucketed by hash. In each bucket of two or more, the path that sorts
    first by (length, lexicographic) is kept as the original; the rest are
    printed as duplicates, or deleted when *is_delete* is True.
    """
    text_hash_groups: dict[str, set] = {}
    for file_path in tqdm(_get_path_files(in_paths)):
        file_encoding: str = _detect_encoding(file_path)
        with open(file_path, 'r', encoding=file_encoding, errors='ignore') as file_pointer:
            # Hash the decoded text (re-encoded as UTF-8) so byte-level
            # encoding differences between files do not mask duplicates.
            text_digest: str = hashlib.sha1(file_pointer.read().encode('utf-8')).hexdigest()
        text_hash_groups.setdefault(text_digest, set()).add(file_path)
    for text_digest, group_paths in text_hash_groups.items():
        group_size: int = len(group_paths)
        if group_size <= 1:
            continue
        print(f'\nFound {group_size} files with text hash {text_digest}')
        ordered_paths: list[str] = sorted(group_paths, key=lambda fp: (len(fp), fp))
        print(f' {ordered_paths[0]} [Original]')
        for file_path in ordered_paths[1:]:
            if is_delete:
                _delete_path(input_path=file_path)
            else:
                print(f' {file_path} [Duplicate]')
if __name__ == "__main__": | |
parser: ArgumentParser = ArgumentParser() | |
parser.add_argument('paths', nargs='*') | |
parser.add_argument('-d', '--delete', help='delete duplicate files', action='store_true') | |
arguments: Namespace = parser.parse_args() | |
if arguments.paths: | |
check_duplicates(in_paths=arguments.paths, is_delete=arguments.delete) | |
else: | |
parser.print_help() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment