Skip to content

Instantly share code, notes, and snippets.

@duganchen
Last active May 30, 2022 22:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duganchen/1e917c11fce44267b4c4 to your computer and use it in GitHub Desktop.
Save duganchen/1e917c11fce44267b4c4 to your computer and use it in GitHub Desktop.
dupe_files.py
#!/usr/bin/env python3
"""
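Python script that takes a directory and displays the files in that directory
tree that share the same size, grouped by size.
Useful as a first pass when hunting down duplicate files: equal size is a
cheap hint, not proof, that two files have identical content.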
"""
from argparse import ArgumentParser, ArgumentTypeError
from os import walk
from os.path import getsize
from pathlib import Path
from typing import DefaultDict, Generator, List
def main() -> None:
    """Parse the command line and print groups of files that share a size.

    Every regular file under the ``starting_point`` argument is bucketed by
    its size in bytes.  Buckets holding more than one file are printed in
    ascending size order: first the human-readable size, then one
    tab-indented path per line.
    """
    # ArgumentParser's first positional parameter is ``prog``; passing the
    # sentence positionally would make it the program name in usage output
    # and leave the description empty, so it is passed by keyword here.
    parser = ArgumentParser(
        description="Find files with the same sizes in a directory tree."
    )
    parser.add_argument(
        "starting_point",
        type=directory,
        help="The directory to start looking in.",
    )
    args = parser.parse_args()

    # Map each size in bytes to the paths that have exactly that size.
    file_sizes = DefaultDict[int, List[Path]](list)
    for path in iter_files(args.starting_point):
        file_sizes[getsize(path)].append(path)

    # Keys are unique, so sorting the items orders the groups by size alone.
    for size, paths in sorted(file_sizes.items()):
        if len(paths) > 1:
            print(sizeof_fmt(size))
            for path in paths:
                print(f"\t{path}")
def iter_files(starting_point: str):
    """Lazily yield a ``Path`` for each regular file below *starting_point*.

    The tree is traversed with ``os.walk``; each file name it reports is
    joined onto its containing directory and yielded only if ``is_file()``
    confirms it.
    """
    for root, _, names in walk(starting_point):
        parent = Path(root)
        candidates = (parent / name for name in names)
        # is_file() is False for dangling symlinks, which walk() still lists
        # among the file names, so filtering on it drops them.
        yield from filter(Path.is_file, candidates)
def directory(path: str) -> Path:
    """Convert a command-line string into a ``Path`` to an existing directory.

    Intended for use as an argparse ``type=`` callable.

    Raises:
        ArgumentTypeError: if *path* does not refer to a directory.
    """
    candidate = Path(path)
    if candidate.is_dir():
        return candidate
    raise ArgumentTypeError("not a directory")
def sizeof_fmt(num: float) -> str:
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024, then
    rendered with one decimal place followed by the matching unit
    (``bytes``, ``KB``, ``MB``, ``GB``).  Anything that is still too large
    after ``GB`` is reported in ``TB``.

    Negative values are scaled by their magnitude and keep their sign, so
    ``-2048`` becomes ``"-2.0KB"``.
    """
    # http://stackoverflow.com/a/1094933/240515
    for unit in ("bytes", "KB", "MB", "GB"):
        # Compare the magnitude: a plain ``num < 1024.0`` is always true for
        # negative input, which would leave it unscaled in "bytes".
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}"
        num /= 1024.0
    return f"{num:3.1f}TB"
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment