Last active
January 5, 2025 18:59
-
-
Save ph33nx/12ce315ef6dbcf9ef1d01f5371af4a3d to your computer and use it in GitHub Desktop.
project2yaml.py - Python script that exports a project's structure and file contents to YAML for LLM, ChatGPT, and agent-based workflows. It scans a project's directory structure and exports the file hierarchy and contents into a clean, human-readable YAML file. This script is designed to simplify AI workflows, including those leveraging ChatGPT, LLMs, and…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
project2yaml.py

Python script to scan a project's directory structure and export it to a YAML file.
Respects .gitignore rules if present, handles subdirectories, and preserves file contents.

Author: ph33nx
GitHub: https://github.com/ph33nx/

Features:
    - Scans all files in a directory and subdirectories.
    - Excludes files/directories specified in .gitignore.
    - Always ignores `.git` and hidden files/folders (e.g., `.env`, `.vscode`).
    - Skips binary files like images and executables.
    - Outputs a well-structured YAML file with project structure, file contents, and a placeholder tasks section.
    - Dynamically installs PyYAML if it's not already installed.
    - Enforces path input; shows help if no path is provided.

Usage:
    python3 project2yaml.py [project_path]

Example:
    python3 project2yaml.py .
"""
import argparse
import fnmatch
import importlib
import logging
import os
import subprocess
import sys
# Set up logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
)

# Attempt to import PyYAML; if it is missing, install it with pip for the
# interpreter running this script and retry the import once.
try:
    import yaml
except ModuleNotFoundError:
    logging.warning("PyYAML module not found. Attempting to install it...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyyaml"])
        # The import system caches directory listings; a package installed
        # after interpreter start-up may stay invisible until the caches
        # are invalidated.
        importlib.invalidate_caches()
        import yaml

        logging.info("PyYAML successfully installed!")
    except Exception as e:
        # Covers both a failed pip invocation and a failed retry import.
        logging.critical(f"Failed to install PyYAML: {e}")
        sys.exit(1)
def parse_gitignore(base_path):
    """
    Parse the .gitignore file in the project root and extract ignore patterns.

    Blank lines and comment lines (first non-blank character is `#`) are
    dropped; every remaining line is returned with surrounding whitespace
    stripped. The patterns `.git` and `.*` are always appended, whether or
    not a .gitignore exists or could be read.

    Args:
        base_path: Directory expected to contain the `.gitignore` file.

    Returns:
        A list of pattern strings, in file order, followed by `.git` and `.*`.
    """
    gitignore_path = os.path.join(base_path, ".gitignore")
    ignore_patterns = []

    if os.path.exists(gitignore_path):
        logging.info(f"Found .gitignore at {gitignore_path}. Parsing...")
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise be glued onto the first pattern.
            with open(gitignore_path, "r", encoding="utf-8-sig") as f:
                stripped_lines = (line.strip() for line in f)
                # Test the stripped text so indented comments are skipped too.
                ignore_patterns = [
                    entry
                    for entry in stripped_lines
                    if entry and not entry.startswith("#")
                ]
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable .gitignore only disables its rules.
            logging.error(f"Error reading .gitignore: {e}")
    else:
        logging.info(".gitignore not found. Proceeding without ignoring files.")

    # Always ignore `.git` and hidden files/folders
    ignore_patterns.extend([".git", ".*"])
    return ignore_patterns
def is_ignored(path, base_path, ignore_patterns):
    """
    Check if a file or directory matches any of the given ignore patterns.

    Matching is a simplified approximation of .gitignore rules built on
    `fnmatch`:
      - every pattern is tried against the whole path relative to `base_path`;
      - a pattern without a slash (e.g. `.*`, `__pycache__`, `*.log`) is also
        tried against each individual path component, so it applies at any
        depth;
      - a trailing slash (`build/`) is dropped before matching, so the
        pattern matches the entry itself (files and directories alike);
      - a leading slash (`/build`) anchors the pattern to `base_path`.

    Args:
        path: File or directory path to test.
        base_path: Project root the patterns are relative to.
        ignore_patterns: Iterable of pattern strings.

    Returns:
        True if any pattern matches, otherwise False.
    """
    relative_path = os.path.relpath(path, base_path).replace(
        "\\", "/"
    )  # Normalize paths
    components = [
        part for part in relative_path.split("/") if part not in ("", ".", "..")
    ]

    for pattern in ignore_patterns:
        trimmed = pattern.rstrip("/")
        anchored = trimmed.startswith("/")
        trimmed = trimmed.lstrip("/")
        if not trimmed:
            continue  # Nothing left to match (e.g. the pattern was just "/")

        matched = fnmatch.fnmatch(relative_path, trimmed)
        if not matched and not anchored and "/" not in trimmed:
            # Slash-free patterns apply to a name at any directory level.
            matched = any(fnmatch.fnmatch(part, trimmed) for part in components)

        if matched:
            logging.debug(f"Ignored: {relative_path} (matches pattern: {pattern})")
            return True
    return False
def is_binary(file_path):
    """
    Report whether a file looks binary, judged by its first 1024 bytes.

    A file counts as binary when that leading chunk contains a null byte.
    Files that cannot be opened or read are logged and reported as binary.
    """
    try:
        with open(file_path, "rb") as handle:
            head = handle.read(1024)
    except Exception as e:
        logging.error(f"Error checking if file is binary: {file_path}. {e}")
        return True  # Unreadable files are treated like binary ones

    # Null bytes are common in binary data and rare in text
    return b"\0" in head
def read_project_structure(base_path, ignore_patterns):
    """
    Walk the project tree and collect the text content of every kept file.

    Directories matching the ignore patterns are pruned from the walk;
    files that match the patterns or look binary are left out. Returns a
    dict mapping each file's path (relative to `base_path`) to its content,
    or to None when the file could not be read.
    """
    collected = {}

    for current_dir, subdirs, filenames in os.walk(base_path):
        # Prune ignored directories in place so os.walk does not descend into them
        kept_dirs = []
        for dirname in subdirs:
            if is_ignored(os.path.join(current_dir, dirname), base_path, ignore_patterns):
                continue
            kept_dirs.append(dirname)
        subdirs[:] = kept_dirs

        for filename in filenames:
            absolute_path = os.path.join(current_dir, filename)

            # Ignored files are dropped silently
            if is_ignored(absolute_path, base_path, ignore_patterns):
                continue

            # Binary files are dropped with a log entry
            if is_binary(absolute_path):
                logging.info(f"Skipping binary file: {absolute_path}")
                continue

            relative_path = os.path.relpath(absolute_path, base_path)
            try:
                # Undecodable bytes are discarded rather than raising
                with open(
                    absolute_path, "r", encoding="utf-8", errors="ignore"
                ) as handle:
                    text = handle.read()
                logging.info(f"Read file: {relative_path}")
            except Exception as e:
                logging.error(f"Error reading file {relative_path}: {e}")
                text = None  # The file is still listed, with no content

            collected[relative_path] = text

    return collected
def generate_yaml(project_data, output_path):
    """
    Write the project structure, file contents, and placeholder tasks to a YAML file.

    The file is always named `project_structure.yaml` and is created inside
    `output_path`. Multi-line strings are emitted in block style (`|`) so
    file contents stay readable.

    Args:
        project_data: Dict mapping relative file paths to file content
            (a string, or None for files that could not be read).
        output_path: Directory in which the YAML file is written.

    Returns:
        None. Failures are logged rather than raised.
    """
    yaml_file_path = os.path.join(output_path, "project_structure.yaml")

    class _BlockStyleDumper(yaml.Dumper):
        """Private dumper so the custom representer never leaks into PyYAML's global Dumper."""

    # Custom representer to use block style for multi-line strings
    def str_presenter(dumper, data):
        if "\n" in data:  # Use block style if the string contains newlines
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    # Register on the private subclass only; add_representer copies the
    # representer table for the subclass, leaving yaml.Dumper untouched.
    _BlockStyleDumper.add_representer(str, str_presenter)

    try:
        data = {
            "tasks": [
                {
                    "message": "Example task 1",
                }
            ],
            "project_structure": list(project_data.keys()),
            "files": [
                {"name": file, "content": content}
                for file, content in project_data.items()
            ],
        }

        # Write the YAML file with UTF-8 encoding
        with open(yaml_file_path, "w", encoding="utf-8") as f:
            yaml.dump(
                data,
                f,
                Dumper=_BlockStyleDumper,
                default_flow_style=False,
                sort_keys=False,
                allow_unicode=True,
            )
        logging.info(f"YAML file written to {yaml_file_path}")
    except Exception as e:
        logging.error(f"Failed to write YAML file: {e}")
def main():
    """
    Command-line entry point: parse arguments, scan the project, write the YAML export.

    Takes one required positional argument, the project directory. Exits
    with status 1 (after printing help) when that path is not a directory.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description="Export a project's structure and file contents to YAML."
    )
    parser.add_argument(
        "project_path",
        help="Path to the project directory. Use '.' for the current directory.",
    )
    args = parser.parse_args()

    # Get the project path
    project_path = os.path.abspath(args.project_path)

    # Ensure the provided path is valid
    if not os.path.isdir(project_path):
        logging.error(f"Invalid project path: {project_path}")
        parser.print_help()
        sys.exit(1)

    logging.info(f"Scanning project directory: {project_path}")

    # Dynamically add the script's own name to the ignore list
    script_name = os.path.basename(__file__)

    # Parse .gitignore and add script name to ignore patterns
    ignore_patterns = parse_gitignore(project_path)
    ignore_patterns.append(script_name)
    # Also skip the export written by an earlier run; otherwise each run
    # would embed the previous output and the file would keep growing.
    ignore_patterns.append("project_structure.yaml")

    # Read project structure and contents
    project_data = read_project_structure(project_path, ignore_patterns)

    # Generate YAML file
    generate_yaml(project_data, project_path)
if __name__ == "__main__":
    # Run the CLI; Ctrl+C and unexpected errors are logged and become exit status 1.
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        logging.warning("Process interrupted by user.")
        exit_status = 1
    except Exception as exc:
        logging.critical(f"Unexpected error: {exc}")
        exit_status = 1

    # On success fall through without calling sys.exit
    if exit_status:
        sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment