Skip to content

Instantly share code, notes, and snippets.

@albertbuchard
Last active May 20, 2024 18:39
Show Gist options
  • Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
textrepo: A Python Script to Concatenate All Files in a Repository into a Single Text File, Ignoring Specified Patterns
#!/usr/bin/env python3
import fnmatch
import os
"""
This Python script, textrepo, concatenates all files within a specified repository into a single text file
while respecting .gitignore patterns and additional specified ignore patterns. It prints the formatted content
to both a specified output file and standard output. This is useful for reviewing all content within a repository
in a structured format, excluding unwanted files and directories such as node_modules, dist, build, and others.
Usage:
textrepo <repository_root_directory> <output_file_path>
Example:
textrepo ~/path/to/my-repo my-repo-content.txt
Ignore Patterns:
- Reads patterns from .gitignore if present
- Additional default patterns: .git, *.pyc, __pycache__, package-lock.json, node_modules, dist, build, venv
"""
def parse_gitignore(gitignore_path):
    """Read ignore patterns from a .gitignore-style file.

    Surrounding whitespace is stripped from every line; blank lines and
    comment lines (first non-blank character is ``#``) are dropped.

    Args:
        gitignore_path: Path of the file to read.

    Returns:
        A list of pattern strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly instead of relying on the platform's locale encoding.
    # "utf-8-sig" also swallows a leading BOM so it cannot glue itself to the
    # first pattern; errors="replace" keeps a stray bad byte from aborting.
    with open(gitignore_path, 'r', encoding='utf-8-sig', errors='replace') as f:
        stripped_lines = (line.strip() for line in f)
        # The comment test runs on the *stripped* line so that indented
        # comments are filtered as well.
        return [line for line in stripped_lines if line and not line.startswith('#')]
def is_ignored(file_path, ignore_patterns):
    """Return True if *file_path* is matched by any of *ignore_patterns*.

    Matching uses ``fnmatch`` and approximates .gitignore semantics:

    * A trailing ``/`` on a pattern (``build/``) is dropped before matching,
      so the pattern matches the entry named ``build``.
    * A pattern containing a ``/`` (or starting with one, e.g. ``/build``) is
      matched against the full path and every leading sub-path of it.
    * A pattern without any ``/`` (``node_modules``, ``*.pyc``) is
      additionally matched against every individual path component, so it
      applies at any depth.

    Negation patterns (``!foo``) are not interpreted specially.

    Args:
        file_path: Path relative to the repository root, using ``os.sep``.
        ignore_patterns: Iterable of pattern strings.

    Returns:
        True if any pattern matches, otherwise False.
    """
    path_parts = file_path.split(os.sep)
    # Every leading sub-path ("a", "a/b", "a/b/c"); the last is the full path,
    # so matching a parent directory also ignores everything below it.
    prefixes = [os.sep.join(path_parts[:i]) for i in range(1, len(path_parts) + 1)]

    for raw_pattern in ignore_patterns:
        anchored = raw_pattern.startswith('/')
        # Paths handed to this function never start or end with a slash, so a
        # pattern that keeps its leading/trailing slash could never match.
        pattern = raw_pattern.strip('/')
        if not pattern:
            continue
        if anchored or '/' in pattern:
            candidates = prefixes
        else:
            # Slash-free patterns apply to any single name along the path.
            candidates = prefixes + path_parts
        if any(fnmatch.fnmatch(candidate, pattern) for candidate in candidates):
            return True
    return False
def get_file_paths(root_dir, ignore_patterns):
    """Collect the non-ignored files below *root_dir*.

    Args:
        root_dir: Directory to walk.
        ignore_patterns: Patterns passed to ``is_ignored`` for every
            sub-directory and file (as paths relative to *root_dir*).

    Returns:
        A list of file paths relative to *root_dir*. Within each directory
        the entries are visited in sorted order, so the result is
        deterministic for a given tree.
    """
    file_paths = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Prune ignored sub-directories in place so os.walk never descends
        # into them. The root itself is deliberately never tested: its
        # relative path is ".", which patterns such as ".*" would match and
        # thereby hide the entire repository.
        dirnames[:] = sorted(
            name for name in dirnames
            if not is_ignored(
                os.path.relpath(os.path.join(dirpath, name), root_dir),
                ignore_patterns,
            )
        )
        for filename in sorted(filenames):
            file_path = os.path.relpath(os.path.join(dirpath, filename), root_dir)
            if not is_ignored(file_path, ignore_patterns):
                file_paths.append(file_path)
    return file_paths
def read_file(file_path):
    """Return the text content of *file_path*, or None if it is not readable text.

    The file is decoded as UTF-8 first and as Latin-1 as a fallback.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded content as a string, or None when the file cannot be
        opened/read or looks like binary data.
    """
    sniff_bytes = 8192
    try:
        with open(file_path, 'rb') as raw:
            # Latin-1 can decode *any* byte sequence, so without this check
            # binary files would come back as garbage text. A NUL byte in
            # the leading chunk is used as the binary-file heuristic.
            if b'\0' in raw.read(sniff_bytes):
                return None
    except OSError:
        return None

    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Wrong guess; try the next encoding.
            continue
        except OSError:
            # An I/O failure will not be cured by a different encoding.
            return None
    return None
def format_file(file_path):
    """Render one file as a titled section of the combined output.

    Args:
        file_path: Path of the file to render.

    Returns:
        A header (file name, path and a 40-character ``=`` rule) followed by
        the file content and a blank line, or an empty string when
        ``read_file`` returns None for the file.
    """
    text = read_file(file_path)
    if text is None:
        return ""
    header = "Title: {name}\nPath: {path}\n{rule}\n".format(
        name=os.path.basename(file_path),
        path=file_path,
        rule='=' * 40,
    )
    return ''.join((header, text, '\n\n'))
def format_repository(root_dir, output_file_path):
    """Concatenate the non-ignored files of a repository into one text file.

    Ignore patterns come from ``<root_dir>/.gitignore`` (when that file
    exists) plus a built-in list of common build/tooling artefacts. The
    result starts with a listing of the included paths, followed by one
    section per file as produced by ``format_file``. It is written to
    *output_file_path* as UTF-8 and also printed to standard output.

    Args:
        root_dir: Root directory of the repository.
        output_file_path: Destination of the combined text file.
    """
    gitignore_path = os.path.join(root_dir, '.gitignore')
    # isfile (not exists): a directory named ".gitignore" cannot be parsed.
    ignore_patterns = parse_gitignore(gitignore_path) if os.path.isfile(gitignore_path) else []
    ignore_patterns += ['.git', '*.pyc', '__pycache__', 'package-lock.json', 'node_modules', 'dist', 'build',
                        'venv', "*/venv/*", "*/__pycache__/*", "*/.git/*", "*/.idea/*", "*/node_modules/*", "*/dist/*",
                        "*/build/*", "*/package-lock.json"]
    # Drop duplicates while keeping a stable order.
    ignore_patterns = list(dict.fromkeys(ignore_patterns))

    # If the output file lives inside the repository, a previous run's output
    # would otherwise be swallowed into the new one; leave it out.
    output_abs = os.path.abspath(output_file_path)
    file_paths = [
        path for path in get_file_paths(root_dir, ignore_patterns)
        if os.path.abspath(os.path.join(root_dir, path)) != output_abs
    ]

    sections = ["Repository File Hierarchy:\n" + '\n'.join(file_paths) + '\n\n' + '=' * 40 + '\n\n']
    sections.extend(format_file(os.path.join(root_dir, path)) for path in file_paths)
    # Join once instead of growing a string with += for every file.
    formatted_files = ''.join(sections)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(formatted_files)
    print(formatted_files)
# Command-line entry point: textrepo <repository_root_directory> <output_file_path>
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        # Diagnostics go to stderr and the exit status is non-zero so that
        # shell scripts and pipelines can detect the failure.
        print("Usage: textrepo <repository_root_directory> <output_file_path>", file=sys.stderr)
        sys.exit(2)

    root_directory = sys.argv[1]
    # Expand "~" in both arguments, in case the shell did not already do so
    # (e.g. when the argument was quoted).
    output_file = os.path.expanduser(sys.argv[2])
    expanded_path = os.path.expanduser(root_directory)
    if not os.path.exists(expanded_path):
        print(f"Error: {expanded_path} does not exist.", file=sys.stderr)
        sys.exit(1)

    format_repository(expanded_path, output_file)
@albertbuchard
Copy link
Author

If you plan on using it often:

chmod +x textrepo.py
sudo mv textrepo.py /usr/local/bin/textrepo

Use it:

textrepo ~/my/repo my-repo-content.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment