Last active
June 22, 2023 01:20
-
-
Save mhrstmnn/d4a9dd350e73606122b7cf19ce45e0b8 to your computer and use it in GitHub Desktop.
Find duplicate files recursively in all subdirectories starting in the directory passed as an argument (or in the current working directory) using checksums (SHA-256)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import os | |
import hashlib | |
class Directory:
    """A directory node that scans its whole subtree when it is created.

    Constructing an instance lists ``path``, wraps every regular file in a
    ``File`` (which hashes it immediately) and every real subdirectory in a
    nested ``Directory``. Entries whose name starts with a dot are ignored.

    Instance fields:
        path:        the path this node was created with.
        directories: child ``Directory`` objects, sorted by name.
        files:       child ``File`` objects, sorted by name.
    """

    def __init__(self, path: str) -> None:
        """Scan ``path`` recursively.

        Raises:
            OSError: whatever ``os.listdir`` raises for ``path``, e.g.
                ``FileNotFoundError`` or ``NotADirectoryError``.
        """
        self.path = path
        entries = self.get_directory_entries(path)
        self.directories = self.get_directories(entries)
        self.files = self.get_files(entries)

    def get_directory_entries(self, directory_path: str) -> list[str]:
        """Return the joined paths of all non-hidden entries, sorted by name."""
        # os.listdir returns names in arbitrary order; sorting keeps the
        # printed report stable between runs.
        return [
            os.path.join(directory_path, entry)
            for entry in sorted(os.listdir(directory_path))
            if not entry.startswith('.')
        ]

    def get_directories(self, entries: list[str]) -> list['Directory']:
        """Return a ``Directory`` for every entry that is a real directory."""
        # Symlinked directories are skipped on purpose: following them can
        # recurse forever on a link cycle and would list the same files twice.
        return [
            Directory(entry)
            for entry in entries
            if os.path.isdir(entry) and not os.path.islink(entry)
        ]

    def get_files(self, entries: list[str]) -> list['File']:
        """Return a ``File`` for every entry that is a regular file."""
        # "Not a directory" is not the same as "hashable file": broken
        # symlinks, FIFOs, sockets and devices must not be opened for hashing.
        return [File(entry) for entry in entries if os.path.isfile(entry)]
class File: | |
checksums: dict[str, list[str]] = {} | |
def __init__(self, path: str) -> None: | |
self.path = path | |
self.checksum = self.get_file_checksum(path) | |
if self.checksum not in self.checksums: | |
self.checksums[self.checksum] = [self.path] | |
else: | |
self.checksums[self.checksum].append(self.path) | |
def get_file_checksum(self, file_path: str) -> str: | |
with open(file_path, 'rb') as file: | |
digest = hashlib.file_digest(file, 'sha256') | |
return digest.hexdigest() | |
def print_directories(directory: Directory, directory_path: str, subdirectory=False) -> None:
    """Print ``directory`` and, recursively, all of its subdirectories.

    The top-level call prints the directory's full path. Recursive calls pass
    ``subdirectory=True`` and print the path with the ``directory_path``
    prefix replaced by a leading dot. Each heading is followed by the base
    names of the files directly inside that directory and a blank line.
    """
    if subdirectory:
        relative = directory.path.removeprefix(directory_path)
        print('Subdirectory:', "'." + relative + "'")
    else:
        print('Directory:', "'" + directory.path + "'")

    print_files([os.path.basename(entry.path) for entry in directory.files])
    print()

    for child in directory.directories:
        print_directories(child, directory_path, True)
def print_files(files: list[str]) -> None:
    """Print ``files`` as a bracketed, one-name-per-line list.

    Each name is shown in single quotes, with a comma after every line except
    the last. An empty list prints a "no files" notice instead.
    """
    if not files:
        print('No files found in this directory!')
        return

    quoted = [" '" + name + "'" for name in files]
    print('Files: [', ',\n'.join(quoted), ']', sep='\n')
def print_identical_files(directory_path: str) -> None:
    """Print every group of files that share a checksum.

    Reads the ``File.checksums`` registry and keeps only the checksums
    recorded for more than one path. Each group is printed under its checksum,
    with the ``directory_path`` prefix of every path replaced by a leading
    dot. If no checksum has more than one path, a notice is printed instead.
    """
    duplicates = {
        digest: paths
        for digest, paths in File.checksums.items()
        if len(paths) > 1
    }
    if not duplicates:
        print('\nNo identical files found (using SHA-256)!')
        return

    print('\nThe identical files are (found using SHA-256):')
    for digest, paths in duplicates.items():
        print('\nChecksum:', "'" + digest + "'")
        print_files(['.' + path.removeprefix(directory_path) for path in paths])
def main() -> int:
    """Scan a directory tree and report its files and any duplicates.

    The directory is taken from the command-line arguments; all arguments
    after the program name are joined with single spaces, so an unquoted path
    containing spaces still works. Without arguments the current working
    directory is scanned.

    Returns:
        0 on success, 1 if the directory does not exist or is not a directory.
    """
    # Slice a copy instead of popping from sys.argv, so the process-wide
    # argument list is left untouched for any other code that reads it.
    arguments = sys.argv[1:]
    directory_path: str
    if not arguments:
        print('No directory was passed as an argument!')
        print('The current working directory is used instead …\n')
        directory_path = os.getcwd()
    else:
        print('A directory was passed as an argument …\n')
        directory_path = os.path.abspath(' '.join(arguments))

    # Separate except clauses match subclasses as well, so every caught error
    # gets a reason; an exact type() comparison could leave the line unfinished.
    try:
        directory = Directory(directory_path)
    except FileNotFoundError:
        reason = 'No such directory!'
    except NotADirectoryError:
        reason = 'Not a directory!'
    else:
        print_directories(directory, directory_path)
        print_identical_files(directory_path)
        return 0

    print('But an error occurred:', end=' ')
    print(reason)
    print('Argument:', '\'' + directory_path + '\'')
    return 1
# Run the scan only when this file is executed as a script, and hand the
# return value of main() to the interpreter as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment