#!/usr/bin/env python3
# Last active May 30, 2022 22:51
"""
Python script that takes a directory and displays the files in that
directory tree that have the same size, grouped by size.

Useful for hunting down duplicate files.
"""
from argparse import ArgumentParser, ArgumentTypeError
from os import walk
from os.path import getsize
from pathlib import Path
from typing import DefaultDict, Generator, List
def main() -> None:
    """Print every group of files under a directory that share a size.

    Parses one positional command-line argument, ``starting_point``
    (validated by ``directory``), walks the tree below it with
    ``iter_files``, buckets the files by their size in bytes, and prints
    each bucket that holds more than one file, smallest size first.
    """
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would place it in the usage line
    # as if it were the program name.
    parser = ArgumentParser(
        description="Find files with the same sizes in a directory tree.",
    )
    parser.add_argument(
        "starting_point",
        type=directory,
        help="The directory to start looking in.",
    )
    args = parser.parse_args()

    # Maps a size in bytes to every file found with exactly that size.
    file_sizes = DefaultDict[int, List[Path]](list)
    for path in iter_files(args.starting_point):
        file_sizes[getsize(path)].append(path)

    for size in sorted(file_sizes):
        paths = file_sizes[size]
        # A size seen only once cannot belong to a duplicate.
        if len(paths) > 1:
            print(f"{sizeof_fmt(size)}:")
            for path in paths:
                print(f"    {path}")
def iter_files(starting_point: str):
    """Yield a ``Path`` for each file found below *starting_point*.

    The tree is traversed with ``os.walk``; entries for which
    ``Path.is_file()`` is false are left out.
    """
    for parent, _subdirs, names in walk(starting_point):
        base = Path(parent)
        # is_file() is False for a symlink whose target is missing, so
        # dangling links are dropped here.
        yield from (
            entry
            for entry in (base / name for name in names)
            if entry.is_file()
        )
def directory(path: str) -> Path:
    """Convert *path* to a ``Path``, accepting it only if it is a directory.

    Raises:
        ArgumentTypeError: if *path* does not refer to a directory.
    """
    candidate = Path(path)
    if candidate.is_dir():
        return candidate
    raise ArgumentTypeError("not a directory")
def sizeof_fmt(num: float) -> str:
    """Format a byte count as a string with one decimal place and a unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit, TB, is reached; values beyond that stay expressed in TB.
    """
    units = ("bytes", "KB", "MB", "GB", "TB")
    last = len(units) - 1
    index = 0
    # Written as "not <" rather than ">=" so a NaN keeps scaling up to TB.
    while index < last and not num < 1024.0:
        num /= 1024.0
        index += 1
    return f"{num:3.1f}{units[index]}"
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
