Skip to content

Instantly share code, notes, and snippets.

@neelabhg
Last active February 7, 2022 20:57
Show Gist options
  • Save neelabhg/f4ba33c618ae695015b1df0902523d7e to your computer and use it in GitHub Desktop.
Save neelabhg/f4ba33c618ae695015b1df0902523d7e to your computer and use it in GitHub Desktop.
Python script to scan a directory tree and output information for files and directories to a JSON file
#!/usr/bin/env python3
"""
Scan a directory tree and output information for files and directories to a JSON file.
To use:
./scan_dir_tree.py --help
Using pyenv:
PYENV_VERSION=3.10.0 ./scan_dir_tree.py --help
"""
import sys
MIN_PYTHON = (3, 10)
if sys.version_info < MIN_PYTHON:
sys.exit("Python %s.%s or later is required." % MIN_PYTHON)
import os
import json
import logging
from argparse import ArgumentParser
# Send log records of level DEBUG and above to standard output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class FsItem:
    """A single filesystem entry (file or directory) found during a scan.

    Every attribute is a plain ``str`` or ``int`` so that the dictionary
    produced by :meth:`to_dict` can be passed directly to ``json.dump``.
    """

    def __init__(self, parent_path: str, name: str, kind: str) -> None:
        """Record the entry's kind, name and joined path.

        Args:
            parent_path: Path of the directory that contains the entry.
            name: The entry's own name (the last path component).
            kind: Type tag stored verbatim in the ``kind`` attribute.
        """
        self.kind = kind
        self.name = name
        self.path = os.path.join(parent_path, name)
        # The lengths are stored as attributes (rather than computed on
        # demand) so that they are part of the to_dict() output.
        self.name_length = len(self.name)
        self.path_length = len(self.path)
        # Starts at zero; code outside this class may overwrite it.
        self.num_direct_children = 0

    def to_dict(self) -> dict[str, object]:
        """Return the item's attributes as a new dictionary.

        A shallow copy is returned instead of ``self.__dict__`` itself, so a
        caller that modifies the result cannot accidentally alter the item.
        """
        return dict(self.__dict__)

    def __repr__(self) -> str:
        """Return a short debugging representation of the item."""
        return f"{type(self).__name__}(kind={self.kind!r}, path={self.path!r})"
class FileItem(FsItem):
    """An ``FsItem`` whose kind is always ``"file"``."""

    def __init__(self, parent_path: str, name: str) -> None:
        """Create the entry for the file ``name`` located in ``parent_path``."""
        super().__init__(parent_path=parent_path, name=name, kind="file")
class DirectoryItem(FsItem):
    """An ``FsItem`` whose kind is always ``"directory"``."""

    def __init__(self, parent_path: str, name: str) -> None:
        """Create the entry for the directory ``name`` located in ``parent_path``."""
        super().__init__(parent_path=parent_path, name=name, kind="directory")
def get_fs_items(root_directory_path: str) -> tuple[list[DirectoryItem], list[FileItem]]:
    """Walk a directory tree and describe every directory and file in it.

    Args:
        root_directory_path: Path of an existing directory to scan. The root
            itself is included in the result.

    Returns:
        A ``(directory_items, file_items)`` pair. ``directory_items`` starts
        with the root directory, followed by the other directories in the
        order they were discovered. Each directory visited by ``os.walk`` has
        ``num_direct_children`` set to the number of files plus directories
        listed directly inside it.

    Raises:
        NotADirectoryError: If ``root_directory_path`` is not an existing
            directory.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # when Python runs with -O, which would let a bad path through silently.
    if not os.path.isdir(root_directory_path):
        raise NotADirectoryError(f"Not a directory: {root_directory_path!r}")

    # A trailing separator ("some/dir/") leaves the last path component empty,
    # which would give the root item an empty name. Split once more in that
    # case so the root is named after the actual directory.
    root_parent, root_name = os.path.split(root_directory_path)
    if not root_name:
        root_parent, root_name = os.path.split(root_parent)

    # Keyed by the path exactly as os.walk reports it, so that each visited
    # directory can be looked up when its listing arrives.
    directory_items_by_path: dict[str, DirectoryItem] = {
        root_directory_path: DirectoryItem(root_parent, root_name),
    }
    file_items: list[FileItem] = []

    for current_path, directory_names, file_names in os.walk(root_directory_path):
        # Internal invariant: a directory is registered (as the root, or by
        # its parent's iteration below) before os.walk descends into it.
        assert current_path in directory_items_by_path
        directory_items_by_path[current_path].num_direct_children = (
            len(file_names) + len(directory_names)
        )

        file_items.extend(FileItem(current_path, file_name) for file_name in file_names)

        for directory_name in directory_names:
            dir_item = DirectoryItem(current_path, directory_name)
            # Internal invariant: no directory path is reported twice.
            assert dir_item.path not in directory_items_by_path
            directory_items_by_path[dir_item.path] = dir_item

    return (list(directory_items_by_path.values()), file_items)
def save_fs_items(root_directory_path: str, output_file_path: str) -> None:
    """Scan ``root_directory_path`` and write the result to a JSON file.

    The file written to ``output_file_path`` holds one object with two keys:
    ``"num_items"`` (the total count) and ``"items"`` (one dictionary per
    directory and file, ordered by ascending path length).
    """
    directories, files = get_fs_items(root_directory_path)

    combined: list[FsItem] = [*directories, *files]
    # list.sort() is stable, just like sorted(): entries with equal path
    # length keep their directories-before-files, discovery order.
    combined.sort(key=lambda entry: entry.path_length)
    serialized = [entry.to_dict() for entry in combined]

    payload = {
        "num_items": len(serialized),
        "items": serialized,
    }
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)
def sort_results(input_file_path: str, output_file_path: str, sort_key: str) -> None:
    """Re-sort previously saved scan results in descending order of one key.

    Reads the JSON object stored in ``input_file_path``, sorts the list under
    its ``"items"`` key by ``sort_key`` (largest value first) and writes that
    list, without the surrounding object, to ``output_file_path``.

    Args:
        input_file_path: JSON file containing an object with an ``"items"`` list.
        output_file_path: Destination for the sorted JSON list.
        sort_key: Key present in every item dictionary to sort on.

    Raises:
        KeyError: If ``sort_key`` is missing from at least one item. Nothing
            is written to ``output_file_path`` in that case.
    """
    with open(input_file_path, "r", encoding="utf-8") as f:
        obj = json.load(f)
    items = obj["items"]

    # Check the key up front so a typo produces a message that names the
    # valid choices instead of a bare KeyError raised from inside the sort.
    if any(sort_key not in item for item in items):
        available = sorted({key for item in items for key in item})
        raise KeyError(
            f"Cannot sort by {sort_key!r}: key is not present in every item "
            f"(available keys: {', '.join(available)})"
        )

    items = sorted(items, key=lambda i: i[sort_key], reverse=True)

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=4)
def main() -> None:
    """Parse the command line and execute the requested subcommand.

    Subcommands:
        run <root_directory_path>: scan the tree and write ``./results.json``.
        sort_results <sort_key>: read ``./results.json`` and write the items,
            reverse-sorted by ``sort_key``, to ``./sorted_results.json``.
    """
    cli = ArgumentParser()
    commands = cli.add_subparsers(dest="subcommand", required=True, help="Command to run")

    run_command = commands.add_parser("run")
    run_command.add_argument("root_directory_path", help="Path to scan")

    sort_command = commands.add_parser("sort_results")
    sort_command.add_argument("sort_key", help="An attribute of FsItem for reverse sorting the results")

    options = cli.parse_args()

    # Each subcommand maps to a zero-argument callable; only the selected one
    # is invoked, so attributes of the other subcommand are never touched.
    handlers = {
        "run": lambda: save_fs_items(options.root_directory_path, "./results.json"),
        "sort_results": lambda: sort_results("./results.json", "./sorted_results.json", options.sort_key),
    }
    handler = handlers.get(options.subcommand)
    if handler is not None:
        handler()
# This file is meant to be executed directly. When it is imported instead,
# log an error and terminate the process with exit status 1.
if __name__ != "__main__":
    logger.error("Run as standalone script only")
    sys.exit(1)

main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment