Skip to content

Instantly share code, notes, and snippets.

@rnsloan
Created January 14, 2025 08:54
Show Gist options
  • Save rnsloan/03284903b554c8edd5bbb2b88080e3bc to your computer and use it in GitHub Desktop.
Generates a structured, single text file from a codebase suitable for AI tool upload
import os
import argparse
import re
from datetime import datetime
class ProjectConsolidator:
    """Consolidate a project tree into one structured text file for LLM upload.

    Walks ``project_dir`` twice: first to emit a directory/file tree plus a
    size summary, then to append each file's contents with per-file headers.
    """

    def __init__(self, project_dir, output_file, max_file_size_mb=1):
        """Store configuration.

        Args:
            project_dir: Root directory of the project to consolidate.
            output_file: Path of the consolidated text file to write.
            max_file_size_mb: Per-file size cap in MB; larger files are skipped.
        """
        self.project_dir = project_dir
        self.output_file = output_file
        self.max_file_size_mb = max_file_size_mb
        self.total_size = 0   # running byte total of included files
        self.file_count = 0   # running count of included files
        # Configuration: regexes for directories/files to skip entirely.
        self.exclude_dirs_patterns = [r"__pycache__", r"\.idea", r"\.git", r"venv", r"node_modules"]
        self.exclude_files_patterns = [r"\.pyc$", r"\.log$", r"\.DS_Store$"]
        # Extensions whose content is never inlined (compared case-insensitively).
        self.binary_extensions = {'.db', '.pyc', '.pkl', '.bin', '.jpg', '.png', '.exe'}

    def should_process_file(self, file_path):
        """Return True if *file_path* should be included in the output.

        Excludes the output file itself, anything matching
        ``exclude_files_patterns``, files over the size cap, and paths that
        cannot be stat-ed (e.g. broken symlinks).
        """
        # Never consolidate the output file into itself: it already exists
        # (open for writing) while the tree is walked, so if it lives inside
        # project_dir it would otherwise be swept up.
        if os.path.abspath(file_path) == os.path.abspath(self.output_file):
            return False
        if any(re.search(pattern, file_path) for pattern in self.exclude_files_patterns):
            return False
        try:
            return os.path.getsize(file_path) <= self.max_file_size_mb * 1024 * 1024
        except OSError:
            # Broken symlink / permission error: skip the file, don't abort the run.
            return False

    def detect_language(self, file_path):
        """Return a language tag for *file_path* based on its extension ('text' if unknown)."""
        ext = os.path.splitext(file_path)[1].lower()
        language_map = {
            '.py': 'python',
            '.js': 'javascript',
            '.ts': 'typescript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.h': 'cpp',
            '.cs': 'csharp',
            '.go': 'go',
            '.rb': 'ruby',
            '.php': 'php',
            '.rs': 'rust'
        }
        return language_map.get(ext, 'text')

    def write_file_content(self, outfile, file_path, relative_path):
        """Write one file's header and (for text files) its contents to *outfile*.

        Recognized code files get per-line numbering; binary extensions are
        noted but omitted; undecodable files are flagged instead of raising.
        """
        stats = os.stat(file_path)
        language = self.detect_language(file_path)
        # Per-file header with size and mtime metadata.
        outfile.write(f"\n--- START FILE: {relative_path} ---\n")
        outfile.write(f"File Size: {stats.st_size:,} bytes\n")
        outfile.write(f"Last Modified: {datetime.fromtimestamp(stats.st_mtime)}\n")
        # Lowercase the extension so '.PNG' etc. is treated as binary,
        # consistent with detect_language().
        if os.path.splitext(file_path)[1].lower() in self.binary_extensions:
            outfile.write(f"Content of binary file {relative_path} is omitted.\n")
        else:
            try:
                with open(file_path, 'r', encoding='utf-8') as infile:
                    lines = infile.readlines()
                outfile.write(f"Lines of Code: {len(lines)}\n\n")
                if language != 'text':
                    # Line numbers help an LLM reference specific code locations.
                    for i, line in enumerate(lines, 1):
                        outfile.write(f"{i:4d} | {line}")
                else:
                    outfile.writelines(lines)
            except UnicodeDecodeError:
                outfile.write(f"Content could not be decoded.\n")
        outfile.write(f"\n--- END FILE: {relative_path} ---\n")

    def write_project_summary(self, outfile):
        """Write aggregate counts, byte total, and a rough ~4-bytes/token estimate."""
        outfile.write("--- PROJECT SUMMARY ---\n")
        outfile.write(f"Total Files: {self.file_count}\n")
        outfile.write(f"Total Size: {self.total_size:,} bytes\n")
        outfile.write(f"Estimated Tokens: {(self.total_size // 4):,}\n")
        if (self.total_size // 4) > 100000:
            outfile.write("WARNING: Content may exceed LLM context limits\n")
        outfile.write("--------------------\n\n")

    def consolidate(self):
        """Produce the consolidated file: structure, then summary, then contents."""
        with open(self.output_file, 'w', encoding='utf-8') as outfile:
            outfile.write("--- PROJECT STRUCTURE ---\n")
            # First pass: collect counts/sizes and render the directory tree.
            for root, dirs, files in os.walk(self.project_dir):
                # Prune excluded directories in place so os.walk skips them.
                dirs[:] = [d for d in dirs if not any(re.search(pattern, d)
                           for pattern in self.exclude_dirs_patterns)]
                relative_path = os.path.relpath(root, self.project_dir)
                if relative_path == ".":
                    outfile.write(" /\n")
                else:
                    indent = " " * relative_path.count(os.sep)
                    outfile.write(f"{indent}└─ {os.path.basename(root)}/\n")
                # List included files under this directory.
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if self.should_process_file(file_path):
                        self.file_count += 1
                        indent = " " * (relative_path.count(os.sep) + 1)
                        outfile.write(f"{indent}└─ {file}\n")
                        # Size is re-read here; should_process_file already
                        # verified the path is stat-able.
                        self.total_size += os.path.getsize(file_path)
            outfile.write("--- END PROJECT STRUCTURE ---\n\n")
            # Summary must follow the first pass so totals are complete.
            self.write_project_summary(outfile)
            # Second pass: append file contents.
            for root, dirs, files in os.walk(self.project_dir):
                dirs[:] = [d for d in dirs if not any(re.search(pattern, d)
                           for pattern in self.exclude_dirs_patterns)]
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if self.should_process_file(file_path):
                        relative_path = os.path.relpath(file_path, self.project_dir)
                        self.write_file_content(outfile, file_path, relative_path)
def main():
    """Command-line entry point: parse arguments and run the consolidation."""
    arg_parser = argparse.ArgumentParser(description="Consolidate a project into a single file for LLM analysis.")
    arg_parser.add_argument("project_dir", help="The root directory of the project")
    arg_parser.add_argument("output_file", help="The path to the output file")
    arg_parser.add_argument(
        "--max-file-size",
        type=float,
        default=1.0,
        help="Maximum size of individual files in MB (default: 1.0)",
    )
    options = arg_parser.parse_args()

    ProjectConsolidator(options.project_dir, options.output_file, options.max_file_size).consolidate()
    print(f"Project consolidated into: {options.output_file}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment