Skip to content

Instantly share code, notes, and snippets.

@mhrstmnn
Last active June 22, 2023 01:20
Show Gist options
  • Save mhrstmnn/d4a9dd350e73606122b7cf19ce45e0b8 to your computer and use it in GitHub Desktop.
Save mhrstmnn/d4a9dd350e73606122b7cf19ce45e0b8 to your computer and use it in GitHub Desktop.
Find duplicate files recursively in all subdirectories starting in the directory passed as an argument (or in the current working directory) using checksums (SHA-256)
#!/usr/bin/env python3
import sys
import os
import hashlib
class Directory:
    """A directory plus the visible subdirectories and files found beneath it.

    Constructing an instance scans the whole tree eagerly: each real
    subdirectory becomes a nested ``Directory`` and each regular file becomes
    a ``File``. Entries whose name starts with ``'.'`` are ignored.

    Attributes:
        path: The directory path exactly as passed to the constructor.
        directories: ``Directory`` objects for the visible subdirectories.
        files: ``File`` objects for the visible regular files.
    """

    def __init__(self, path: str) -> None:
        """Scan ``path`` and, recursively, every subdirectory below it.

        Args:
            path: Path of the directory to scan.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``path`` cannot be
                listed (e.g. ``FileNotFoundError``, ``NotADirectoryError``).
        """
        self.path = path
        entries = self.get_directory_entries(path)
        self.directories = self.get_directories(entries)
        self.files = self.get_files(entries)

    def get_directory_entries(self, directory_path: str) -> list[str]:
        """Return the joined paths of all non-hidden entries, sorted by name.

        ``os.listdir`` returns names in arbitrary order; sorting keeps the
        printed report stable between runs.
        """
        return [
            os.path.join(directory_path, name)
            for name in sorted(os.listdir(directory_path))
            if not name.startswith('.')
        ]

    def get_directories(self, entries: list[str]) -> list['Directory']:
        """Return a ``Directory`` for every real subdirectory in ``entries``.

        Symbolic links to directories are skipped on purpose: following them
        can loop through a link that points back up the tree, and would reach
        the same files through a second path.
        """
        return [
            Directory(entry)
            for entry in entries
            if os.path.isdir(entry) and not os.path.islink(entry)
        ]

    def get_files(self, entries: list[str]) -> list['File']:
        """Return a ``File`` for every regular file in ``entries``.

        Only entries that ``os.path.isfile`` accepts are used, so dangling
        symlinks, FIFOs, sockets and skipped directory symlinks are never
        opened for reading.
        """
        return [File(entry) for entry in entries if os.path.isfile(entry)]
class File:
checksums: dict[str, list[str]] = {}
def __init__(self, path: str) -> None:
self.path = path
self.checksum = self.get_file_checksum(path)
if self.checksum not in self.checksums:
self.checksums[self.checksum] = [self.path]
else:
self.checksums[self.checksum].append(self.path)
def get_file_checksum(self, file_path: str) -> str:
with open(file_path, 'rb') as file:
digest = hashlib.file_digest(file, 'sha256')
return digest.hexdigest()
def print_directories(directory: Directory, directory_path: str, subdirectory=False) -> None:
    """Print a directory, its files, and then all of its subdirectories.

    Args:
        directory: The scanned directory to print.
        directory_path: Path of the top-level directory; it is stripped from
            the front of subdirectory paths so they print relative to it.
        subdirectory: False for the top-level call (the path is printed as
            is); True for nested calls (the path is printed with a ``.``
            in place of the ``directory_path`` prefix).
    """
    if subdirectory:
        relative_path = directory.path.removeprefix(directory_path)
        print('Subdirectory:', '\'.' + relative_path + '\'')
    else:
        print('Directory:', '\'' + directory.path + '\'')

    # Only the file names are listed; the heading above already shows where.
    print_files([os.path.basename(entry.path) for entry in directory.files])
    print()

    for child in directory.directories:
        print_directories(child, directory_path, True)
def print_files(files: list[str]) -> None:
    """Print ``files`` as a quoted, bracketed list with one entry per line.

    Args:
        files: The names or paths to print. When the list is empty a
            "no files found" notice is printed instead.
    """
    if not files:
        print('No files found in this directory!')
        return

    quoted = [' \'' + name + '\'' for name in files]
    print('Files: [', ',\n'.join(quoted), ']', sep='\n')
def print_identical_files(directory_path: str) -> None:
    """Print every group of files that share the same SHA-256 checksum.

    Reads the groups from ``File.checksums`` and reports only digests that
    have more than one path.

    Args:
        directory_path: Path of the top-level directory; it is stripped from
            the front of each file path and replaced with ``.``.
    """
    duplicates = {
        digest: paths
        for digest, paths in File.checksums.items()
        if len(paths) > 1
    }
    if not duplicates:
        print('\nNo identical files found (using SHA-256)!')
        return

    print('\nThe identical files are (found using SHA-256):')
    for digest, paths in duplicates.items():
        print('\nChecksum:', '\'' + digest + '\'')
        print_files(['.' + path.removeprefix(directory_path) for path in paths])
def main() -> int:
    """Scan a directory tree, print its contents, and report duplicate files.

    The directory is taken from the command line: all arguments after the
    program name are joined with single spaces and made absolute (presumably
    so that a path containing spaces can be passed unquoted). Without
    arguments the current working directory is scanned.

    Returns:
        0 on success; 1 when the directory does not exist, is not a
        directory, or cannot be read.
    """
    if len(sys.argv) <= 1:
        print('No directory was passed as an argument!')
        print('The current working directory is used instead …\n')
        directory_path = os.getcwd()
    else:
        print('A directory was passed as an argument …\n')
        # Slice instead of pop(0): sys.argv is process-wide state and must
        # not be modified as a side effect of reading it.
        directory_path = os.path.abspath(' '.join(sys.argv[1:]))

    # Validate the argument itself before scanning, so these two messages
    # are only ever printed about the top-level path and not about some
    # entry that failed deep inside the tree.
    problem = None
    if not os.path.exists(directory_path):
        problem = 'No such directory!'
    elif not os.path.isdir(directory_path):
        problem = 'Not a directory!'
    if problem is not None:
        print('But an error occurred:', problem)
        print('Argument:', '\'' + directory_path + '\'')
        return 1

    try:
        directory = Directory(directory_path)
    except OSError as error:
        # Anything that goes wrong while scanning (permission denied, an
        # entry vanishing mid-scan, ...) is reported instead of crashing
        # with a traceback.
        print('But an error occurred:', error)
        print('Argument:', '\'' + directory_path + '\'')
        return 1

    print_directories(directory, directory_path)
    print_identical_files(directory_path)
    return 0
# Run only when executed as a script; the value returned by main() becomes
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment