Created
February 27, 2024 17:47
-
-
Save platomav/2b1a5455a7b727626d528ae5e1933b76 to your computer and use it in GitHub Desktop.
Duplicate text-file finder (and optional deleter) using encoding-aware content hashing, for POSIX/NT, written in Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding=utf-8 | |
import hashlib | |
import os | |
import json | |
from argparse import ArgumentParser, Namespace | |
from chardet.universaldetector import UniversalDetector as ChardetDetector | |
from charset_normalizer import detect as charset_detector | |
from tqdm import tqdm | |
def _delete_path(input_path: str) -> None:
    """Delete a single file (or symlink, even a broken one) and report the outcome.

    Paths that are not deletable files/links (directories, missing paths)
    are reported as errors and left untouched.
    """
    if not _is_valid_path(input_path=input_path, allow_broken_links=True):
        print(f' {input_path} [Error]')
        return
    os.remove(input_path)
    print(f' {input_path} [Deleted]')
def _is_valid_path(input_path: str, allow_broken_links: bool = False) -> bool:
    """Return True when *input_path* points at a regular file.

    With allow_broken_links=True, symlinks (including dangling ones)
    are also accepted. Directories and non-existent paths are rejected.
    """
    abs_path: str = os.path.abspath(input_path)
    # lexists() does not follow symlinks, so a dangling link still counts as present.
    if not os.path.lexists(abs_path) or os.path.isdir(abs_path):
        return False
    if allow_broken_links and os.path.islink(abs_path):
        return True
    return os.path.isfile(abs_path)
def _get_path_files(check_paths: list) -> list[str]:
    """Expand a mix of file and directory paths into a flat list of file paths.

    Directories are walked recursively without following symlinked
    subdirectories; non-directory arguments are kept when they are valid
    files or links (broken links included).
    """
    collected: list[str] = []
    for raw_path in check_paths:
        abs_path: str = os.path.abspath(raw_path)
        if os.path.isdir(abs_path):
            # os.walk defaults to followlinks=False, so link cycles are safe.
            for dir_path, _, file_names in os.walk(abs_path):
                collected.extend(os.path.join(dir_path, file_name) for file_name in file_names)
        elif _is_valid_path(input_path=abs_path, allow_broken_links=True):
            collected.append(abs_path)
    return collected
def _get_encoding_chardet(in_path: str) -> str | None:
    """Detect the text encoding of a file with chardet's incremental detector.

    Streams the file line by line instead of materializing every line in
    memory, and stops feeding as soon as the detector reports it is done
    (chardet's documented incremental-usage pattern) — the result is the
    same, but large files are no longer read in full.

    Returns the detected encoding name, or None when chardet cannot decide.
    """
    chardet_detector: ChardetDetector = ChardetDetector()
    with open(in_path, 'rb') as in_file:
        for in_line in in_file:
            chardet_detector.feed(in_line)
            if chardet_detector.done:  # confident answer reached; stop early
                break
    chardet_detector.close()
    return chardet_detector.result['encoding']
def _get_encoding_charset(in_path: str) -> str | None:
    """Detect the text encoding of a file with charset-normalizer.

    Returns the detected encoding name, or None when detection fails.
    """
    with open(in_path, 'rb') as in_file:
        raw_bytes: bytes = in_file.read()
    detection: dict = charset_detector(raw_bytes)
    return detection['encoding']
def _detect_encoding(in_path: str) -> str:
    """Choose a text encoding for *in_path* by reconciling two detectors.

    When chardet and charset-normalizer agree (or one fails), the available
    answer is used, falling back to 'utf-8' when both fail. When they
    disagree, BOM-signed ('-sig') answers win first, then UTF-8 answers,
    then a small table of known chardet misdetections where
    charset-normalizer is preferred; otherwise chardet's answer is kept.
    """
    chardet_result: str | None = _get_encoding_chardet(in_path)
    charset_result: str | None = _get_encoding_charset(in_path)
    # Easy cases: agreement, or at least one detector failed.
    if not (chardet_result and charset_result) or chardet_result == charset_result:
        return chardet_result or charset_result or 'utf-8'
    # Pairs of (chardet answer, charset-normalizer answer) where the
    # charset-normalizer answer is the trustworthy one.
    charset_preferred: list[tuple] = [
        ('cp949', 'windows-1250'),
        ('euc-jp', 'big5'),
        ('macroman', 'cp932'),
        ('macroman', 'windows-1256')
    ]
    chardet_norm: str = chardet_result.lower().replace('_', '-')
    charset_norm: str = charset_result.lower().replace('_', '-')
    # BOM-signed answers outrank UTF-8 answers; chardet outranks
    # charset-normalizer within each tier (same order as the original chain).
    for marker in ('-sig', 'utf-8'):
        if marker in chardet_norm:
            return chardet_result
        if marker in charset_norm:
            return charset_result
    if (chardet_norm, charset_norm) in charset_preferred:
        return charset_result
    return chardet_result
def check_duplicates(in_paths: list, is_delete: bool) -> None:
    """Find files with identical decoded text and report (or delete) duplicates.

    Every file under *in_paths* is decoded with its detected encoding
    (undecodable bytes ignored), its text is SHA-1 hashed, and files are
    bucketed by hash. In each bucket of two or more, the path that sorts
    first by (length, lexicographic) is kept as the original; the rest are
    printed as duplicates, or deleted when *is_delete* is True.
    """
    text_hash_groups: dict[str, set] = {}
    for file_path in tqdm(_get_path_files(in_paths)):
        file_encoding: str = _detect_encoding(file_path)
        with open(file_path, 'r', encoding=file_encoding, errors='ignore') as file_pointer:
            # Hash the decoded text (re-encoded as UTF-8) so byte-level
            # encoding differences between files do not mask duplicates.
            text_digest: str = hashlib.sha1(file_pointer.read().encode('utf-8')).hexdigest()
        text_hash_groups.setdefault(text_digest, set()).add(file_path)
    for text_digest, group_paths in text_hash_groups.items():
        group_size: int = len(group_paths)
        if group_size <= 1:
            continue
        print(f'\nFound {group_size} files with text hash {text_digest}')
        ordered_paths: list[str] = sorted(group_paths, key=lambda fp: (len(fp), fp))
        print(f' {ordered_paths[0]} [Original]')
        for file_path in ordered_paths[1:]:
            if is_delete:
                _delete_path(input_path=file_path)
            else:
                print(f' {file_path} [Duplicate]')
if __name__ == "__main__": | |
parser: ArgumentParser = ArgumentParser() | |
parser.add_argument('paths', nargs='*') | |
parser.add_argument('-d', '--delete', help='delete duplicate files', action='store_true') | |
arguments: Namespace = parser.parse_args() | |
if arguments.paths: | |
check_duplicates(in_paths=arguments.paths, is_delete=arguments.delete) | |
else: | |
parser.print_help() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment