Created
January 14, 2025 08:54
-
-
Save rnsloan/03284903b554c8edd5bbb2b88080e3bc to your computer and use it in GitHub Desktop.
Generates a structured, single text file from a codebase suitable for AI tool upload
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import argparse | |
import re | |
from datetime import datetime | |
class ProjectConsolidator:
    """Consolidate a project tree into a single annotated text file.

    The output file contains a directory-structure overview, a size/token
    summary, and the contents of every processed file, formatted so the
    result can be uploaded to an LLM/AI tool as one document.
    """

    def __init__(self, project_dir, output_file, max_file_size_mb=1):
        """Initialize the consolidator.

        Args:
            project_dir: Root directory of the project to consolidate.
            output_file: Path of the single text file to produce.
            max_file_size_mb: Per-file size cap in megabytes; larger files
                are skipped entirely.
        """
        self.project_dir = project_dir
        self.output_file = output_file
        self.max_file_size_mb = max_file_size_mb
        self.total_size = 0   # cumulative bytes of all included files
        self.file_count = 0   # number of files included in the output
        # Configuration: regex patterns for directories/files to skip, and
        # extensions whose contents are never inlined (compared lowercase).
        self.exclude_dirs_patterns = [r"__pycache__", r"\.idea", r"\.git", r"venv", r"node_modules"]
        self.exclude_files_patterns = [r"\.pyc$", r"\.log$", r"\.DS_Store$"]
        self.binary_extensions = {'.db', '.pyc', '.pkl', '.bin', '.jpg', '.png', '.exe'}

    def should_process_file(self, file_path):
        """Return True if *file_path* should be included in the output.

        A file is skipped when its path matches an exclusion pattern, when
        it exceeds the configured size limit, or when its size cannot be
        determined (e.g. broken symlink, permission error).
        """
        if any(re.search(pattern, file_path) for pattern in self.exclude_files_patterns):
            return False
        try:
            return os.path.getsize(file_path) <= self.max_file_size_mb * 1024 * 1024
        except OSError:
            # Unreadable entry (broken symlink, permissions): skip it
            # rather than aborting the whole walk.
            return False

    def detect_language(self, file_path):
        """Return a language label for *file_path* based on its extension.

        Unknown extensions map to 'text'.
        """
        ext = os.path.splitext(file_path)[1].lower()
        language_map = {
            '.py': 'python',
            '.js': 'javascript',
            '.ts': 'typescript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.h': 'cpp',
            '.cs': 'csharp',
            '.go': 'go',
            '.rb': 'ruby',
            '.php': 'php',
            '.rs': 'rust'
        }
        return language_map.get(ext, 'text')

    def write_file_content(self, outfile, file_path, relative_path):
        """Write one file's header and content to *outfile*.

        Binary files (by extension) get a placeholder instead of content;
        recognized code files get line numbers; undecodable files get a
        notice. Every file is bracketed by START/END markers.
        """
        stats = os.stat(file_path)
        language = self.detect_language(file_path)
        # Write file header
        outfile.write(f"\n--- START FILE: {relative_path} ---\n")
        outfile.write(f"File Size: {stats.st_size:,} bytes\n")
        outfile.write(f"Last Modified: {datetime.fromtimestamp(stats.st_mtime)}\n")
        # Lowercase the extension so '.PNG' etc. are treated as binary,
        # consistent with detect_language().
        if os.path.splitext(file_path)[1].lower() in self.binary_extensions:
            outfile.write(f"Content of binary file {relative_path} is omitted.\n")
            return
        try:
            with open(file_path, 'r', encoding='utf-8') as infile:
                lines = infile.readlines()
                outfile.write(f"Lines of Code: {len(lines)}\n\n")
                # Write content with line numbers for code files
                if language != 'text':
                    for i, line in enumerate(lines, 1):
                        outfile.write(f"{i:4d} | {line}")
                else:
                    outfile.writelines(lines)
        except UnicodeDecodeError:
            outfile.write("Content could not be decoded.\n")
        outfile.write(f"\n--- END FILE: {relative_path} ---\n")

    def write_project_summary(self, outfile):
        """Write the aggregate summary (file count, size, token estimate).

        The token estimate is a crude ~4-characters-per-token heuristic; a
        warning is emitted when it exceeds 100k tokens.
        """
        outfile.write("--- PROJECT SUMMARY ---\n")
        outfile.write(f"Total Files: {self.file_count}\n")
        outfile.write(f"Total Size: {self.total_size:,} bytes\n")
        outfile.write(f"Estimated Tokens: {(self.total_size // 4):,}\n")
        if (self.total_size // 4) > 100000:
            outfile.write("WARNING: Content may exceed LLM context limits\n")
        outfile.write("--------------------\n\n")

    def consolidate(self):
        """Consolidate the project files into self.output_file.

        Two passes over the tree: the first renders the directory
        structure and accumulates counts/sizes, the second writes each
        included file's content.
        """
        with open(self.output_file, 'w', encoding='utf-8') as outfile:
            outfile.write("--- PROJECT STRUCTURE ---\n")
            # First pass: collect file information and write structure
            for root, dirs, files in os.walk(self.project_dir):
                # Prune excluded directories in place so os.walk skips them
                dirs[:] = [d for d in dirs if not any(re.search(pattern, d)
                           for pattern in self.exclude_dirs_patterns)]
                relative_path = os.path.relpath(root, self.project_dir)
                if relative_path == ".":
                    outfile.write("  /\n")
                else:
                    indent = "  " * relative_path.count(os.sep)
                    outfile.write(f"{indent}└─ {os.path.basename(root)}/\n")
                # Write files in this directory
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if self.should_process_file(file_path):
                        self.file_count += 1
                        indent = "  " * (relative_path.count(os.sep) + 1)
                        outfile.write(f"{indent}└─ {file}\n")
                        self.total_size += os.path.getsize(file_path)
            outfile.write("--- END PROJECT STRUCTURE ---\n\n")
            # Write project summary
            self.write_project_summary(outfile)
            # Second pass: write file contents
            for root, dirs, files in os.walk(self.project_dir):
                dirs[:] = [d for d in dirs if not any(re.search(pattern, d)
                           for pattern in self.exclude_dirs_patterns)]
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if self.should_process_file(file_path):
                        relative_path = os.path.relpath(file_path, self.project_dir)
                        self.write_file_content(outfile, file_path, relative_path)
def main():
    """Command-line entry point: parse arguments and run the consolidation."""
    arg_parser = argparse.ArgumentParser(
        description="Consolidate a project into a single file for LLM analysis.")
    arg_parser.add_argument("project_dir", help="The root directory of the project")
    arg_parser.add_argument("output_file", help="The path to the output file")
    arg_parser.add_argument("--max-file-size", type=float, default=1.0,
                            help="Maximum size of individual files in MB (default: 1.0)")
    opts = arg_parser.parse_args()

    consolidator = ProjectConsolidator(opts.project_dir, opts.output_file,
                                       opts.max_file_size)
    consolidator.consolidate()
    print(f"Project consolidated into: {opts.output_file}")
# Run the CLI entry point only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment