Skip to content

Instantly share code, notes, and snippets.

@ph33nx
Last active January 5, 2025 18:59
Show Gist options
  • Save ph33nx/12ce315ef6dbcf9ef1d01f5371af4a3d to your computer and use it in GitHub Desktop.
Save ph33nx/12ce315ef6dbcf9ef1d01f5371af4a3d to your computer and use it in GitHub Desktop.
project2yaml.py - Python script that exports project structure and file contents to YAML for LLM, ChatGPT, and agent-based workflows. It scans a project's directory structure and exports the file hierarchy and contents into a clean, human-readable YAML file. This script is designed to simplify AI workflows, including those leveraging ChatGPT, LLMs, and…
#!/usr/bin/env python3
"""
project2yaml.py
Python script to scan a project's directory structure and export it to a YAML file.
Respects .gitignore rules if present, handles subdirectories, and preserves file contents.
Author: ph33nx
GitHub: https://github.com/ph33nx/
Features:
- Scans all files in a directory and subdirectories.
- Excludes files/directories specified in .gitignore.
- Always ignores `.git` and hidden files/folders (e.g., `.env`, `.vscode`).
- Skips binary files like images and executables.
- Outputs a well-structured YAML file with project structure, file contents, and a placeholder tasks section.
- Dynamically installs PyYAML if it's not already installed.
- Enforces path input; shows help if no path is provided.
Usage:
python3 project2yaml.py [project_path]
Example:
python3 project2yaml.py .
"""
import os
import fnmatch
import argparse
import logging
import subprocess
import sys
# Configure root logging once so every message carries a timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
)

# PyYAML is the only third-party dependency. If it is missing, try to install
# it with the interpreter that is running this script and import it again.
try:
    import yaml
except ModuleNotFoundError:
    logging.warning("PyYAML module not found. Attempting to install it...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyyaml"])
        # The import system caches what it has seen on sys.path; refresh those
        # caches so the package installed a moment ago is found by the retry.
        import importlib

        importlib.invalidate_caches()
        import yaml

        logging.info("PyYAML successfully installed!")
    except Exception as e:
        # Installation or the second import failed: the script cannot run.
        logging.critical(f"Failed to install PyYAML: {e}")
        sys.exit(1)
def parse_gitignore(base_path):
    """
    Collect ignore patterns from the .gitignore file in the project root.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with "#" are dropped. The patterns ".git" and ".*" are always
    appended, whether or not a .gitignore file exists or could be read.

    Args:
        base_path: Directory expected to contain the .gitignore file.

    Returns:
        A list of pattern strings, in file order, followed by ".git" and ".*".
    """
    gitignore_path = os.path.join(base_path, ".gitignore")
    ignore_patterns = []
    if os.path.exists(gitignore_path):
        logging.info(f"Found .gitignore at {gitignore_path}. Parsing...")
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte
            # order mark, which would otherwise be glued onto the first
            # pattern and stop it from ever matching.
            with open(gitignore_path, "r", encoding="utf-8-sig") as f:
                ignore_patterns = [
                    line.strip()
                    for line in f
                    if line.strip() and not line.startswith("#")
                ]
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable .gitignore only costs its patterns.
            logging.error(f"Error reading .gitignore: {e}")
    else:
        logging.info(".gitignore not found. Proceeding without ignoring files.")
    # Always ignore `.git` and hidden files/folders
    ignore_patterns.extend([".git", ".*"])
    return ignore_patterns
def is_ignored(path, base_path, ignore_patterns):
    """
    Check whether a file or directory matches any of the ignore patterns.

    Patterns are interpreted as a simplified subset of .gitignore syntax,
    matched with fnmatch:
    - A pattern without a slash (e.g. "*.pyc", "__pycache__", ".*") is tested
      against the entry's own name as well as its path relative to base_path,
      so it applies at any depth.
    - A pattern containing a slash other than a trailing one (e.g. "/dist",
      "docs/*.md") is tested against the relative path only; a leading slash
      is dropped first.
    - A pattern ending in "/" (e.g. "build/") applies to directories only.
    Negation ("!pattern") is not given any special treatment.

    Args:
        path: Path of the file or directory to test.
        base_path: Project root that patterns are relative to.
        ignore_patterns: Iterable of pattern strings.

    Returns:
        True if any pattern matches, otherwise False.
    """
    relative_path = os.path.relpath(path, base_path).replace(
        "\\", "/"
    )  # Normalize paths
    name = relative_path.rsplit("/", 1)[-1]
    for pattern in ignore_patterns:
        directories_only = pattern.endswith("/")
        glob = pattern.rstrip("/")
        if not glob:
            continue  # Nothing left to match (pattern was empty or only slashes)
        # A slash at the start or in the middle ties the pattern to the root.
        anchored = "/" in glob
        glob = glob.lstrip("/")
        if directories_only and not os.path.isdir(path):
            continue
        if fnmatch.fnmatch(relative_path, glob) or (
            not anchored and fnmatch.fnmatch(name, glob)
        ):
            logging.debug(f"Ignored: {relative_path} (matches pattern: {pattern})")
            return True
    return False
def is_binary(file_path):
    """
    Guess whether a file is binary by sniffing its first 1024 bytes.

    A file counts as binary when that leading chunk contains a null byte.
    Files that cannot be opened or read are reported as binary as well.
    """
    try:
        with open(file_path, "rb") as handle:
            head = handle.read(1024)
    except Exception as e:
        logging.error(f"Error checking if file is binary: {file_path}. {e}")
        # Unreadable files are treated like binary ones
        return True
    # Null bytes are common in binary data and rare in text
    return b"\0" in head
def read_project_structure(base_path, ignore_patterns):
    """
    Walk the project directory and collect the text content of each file.

    Directories and files matching ignore_patterns (via is_ignored) are
    skipped, as are files that is_binary reports as binary. Directories and
    files are visited in sorted order so that repeated runs over the same
    tree produce the same output order.

    Args:
        base_path: Root directory of the project to scan.
        ignore_patterns: Patterns passed through to is_ignored.

    Returns:
        A dict mapping each file's path relative to base_path to its content
        as a string, or to None when the file could not be read.
    """
    project_data = {}
    for root, dirs, files in os.walk(base_path):
        # Prune ignored directories in place so os.walk never descends into
        # them; sorting fixes the otherwise arbitrary traversal order.
        dirs[:] = sorted(
            d
            for d in dirs
            if not is_ignored(os.path.join(root, d), base_path, ignore_patterns)
        )
        for file in sorted(files):
            full_file_path = os.path.join(root, file)
            # Skip ignored files
            if is_ignored(full_file_path, base_path, ignore_patterns):
                continue
            # Skip binary files
            if is_binary(full_file_path):
                logging.info(f"Skipping binary file: {full_file_path}")
                continue
            # Read file contents; bytes that are not valid UTF-8 are dropped
            relative_path = os.path.relpath(full_file_path, base_path)
            try:
                with open(full_file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                logging.info(f"Read file: {relative_path}")
            except Exception as e:
                logging.error(f"Error reading file {relative_path}: {e}")
                # The file is still listed, just without content
                content = None
            project_data[relative_path] = content
    return project_data
def generate_yaml(project_data, output_path):
    """
    Write the project structure, file contents, and placeholder tasks to
    "project_structure.yaml" inside output_path.

    The document has three top-level keys, in this order: "tasks" (a single
    placeholder entry), "project_structure" (the list of file paths) and
    "files" (one {"name", "content"} mapping per file). Failures while
    building or writing the document are logged, not raised.

    Args:
        project_data: Dict mapping file paths to their content.
        output_path: Directory in which the YAML file is created.
    """
    yaml_file_path = os.path.join(output_path, "project_structure.yaml")

    # Custom representer to use block style for multi-line strings
    def str_presenter(dumper, data):
        if "\n" in data:  # Use block style if the string contains newlines
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    # Register the representer on a private Dumper subclass so PyYAML's
    # shared default Dumper is left untouched for any other user of yaml.
    class BlockStyleDumper(yaml.Dumper):
        pass

    BlockStyleDumper.add_representer(str, str_presenter)

    try:
        data = {
            "tasks": [
                {
                    "message": "Example task 1",
                }
            ],
            "project_structure": list(project_data.keys()),
            "files": [
                {"name": file, "content": content}
                for file, content in project_data.items()
            ],
        }
        # Write the YAML file with UTF-8 encoding
        with open(yaml_file_path, "w", encoding="utf-8") as f:
            yaml.dump(
                data,
                f,
                Dumper=BlockStyleDumper,
                default_flow_style=False,
                sort_keys=False,
                allow_unicode=True,
            )
        logging.info(f"YAML file written to {yaml_file_path}")
    except Exception as e:
        logging.error(f"Failed to write YAML file: {e}")
def main():
    """
    Command-line entry point: scan the given project directory and export it.

    Parses the required "project_path" argument, exits with status 1 (after
    printing help) when it is not a directory, then builds the ignore list,
    reads the project files and writes the YAML export into the project root.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description="Export a project's structure and file contents to YAML."
    )
    parser.add_argument(
        "project_path",
        help="Path to the project directory. Use '.' for the current directory.",
    )
    args = parser.parse_args()
    # Get the project path
    project_path = os.path.abspath(args.project_path)
    # Ensure the provided path is valid
    if not os.path.isdir(project_path):
        logging.error(f"Invalid project path: {project_path}")
        parser.print_help()
        sys.exit(1)
    logging.info(f"Scanning project directory: {project_path}")
    # Dynamically add the script's own name to the ignore list
    script_name = os.path.basename(__file__)
    # Parse .gitignore and add script name to ignore patterns
    ignore_patterns = parse_gitignore(project_path)
    ignore_patterns.append(script_name)
    # Also ignore the export itself; otherwise a second run would embed the
    # previous output in the new one. The name must match the file that
    # generate_yaml writes.
    ignore_patterns.append("project_structure.yaml")
    # Read project structure and contents
    project_data = read_project_structure(project_path, ignore_patterns)
    # Generate YAML file
    generate_yaml(project_data, project_path)
# Run main() when executed as a script; report an interrupt or any unexpected
# error through the log and signal failure with exit status 1.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logging.warning("Process interrupted by user.")
        exit_status = 1
    except Exception as e:
        logging.critical(f"Unexpected error: {e}")
        exit_status = 1
    else:
        exit_status = 0
    if exit_status:
        sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment