Skip to content

Instantly share code, notes, and snippets.

@shaayaansayed
Created April 24, 2024 21:06
Show Gist options
  • Save shaayaansayed/e1efcefeb5c331cfb5e8347957a28c40 to your computer and use it in GitHub Desktop.
Save shaayaansayed/e1efcefeb5c331cfb5e8347957a28c40 to your computer and use it in GitHub Desktop.
CLI tool to flatten a repo by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.
"""
GitHub Repo Flattener
This CLI tool flattens a GitHub repository by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.
Features:
- Flatten a GitHub repository into a single text file
- Specify which file types to include (e.g., skip certain files like .md)
- List all file types in the repository before flattening (helpful for large repositories)
---
Requirements:
- python-dotenv
- PyGithub
- Create a .env file in the same directory as the script and add your GitHub access token
```
GITHUB_TOKEN=your_github_access_token
```
---
Usage:
1. List file types in a repository:
python flatten_repo.py list {org}/{repo_name}
example: `python flatten_repo.py list infiniflow/ragflow`
output: File types in infiniflow/ragflow: .con, .cs, .cud, .j, .jp, .js, .jso, .les, .m, .p, .pe, .pn, .re, .s, .scratc, .sq, .sv, .t, .ts, .tx, .woff, .yam, .ym
2. Flatten a repo:
python flatten_repo.py flatten {org}/{repo_name}
example: `python flatten_repo.py flatten infiniflow/ragflow`
3. Flatten a repo with specific file types:
python flatten_repo.py flatten {org}/{repo_name} --file-types file_type1 file_type2 ...
example: `python flatten_repo.py flatten infiniflow/ragflow --file-types py ts`
"""
import argparse
from datetime import datetime
import os
import base64
import logging
from github import Github
from dotenv import load_dotenv
# Module-level setup: runs once at import time, before any command is parsed.

# Load variables from a local .env file (see the module docstring) into the
# process environment so that os.getenv below can see them.
load_dotenv()
# os.getenv returns None when GITHUB_TOKEN is not set.
# NOTE(review): a None token is passed straight to Github() below; presumably
# that yields an unauthenticated client - confirm against PyGithub.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# Timestamped, level-tagged log lines; INFO and above are emitted.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Shared GitHub API client used by main() to look up the repository.
gh = Github(GITHUB_TOKEN)
def list_file_types(tree):
    """Return the sorted, de-duplicated file extensions found in a git tree.

    Args:
        tree: Object whose ``tree`` attribute is an iterable of entries, each
            exposing a ``path`` string.

    Returns:
        A sorted list of extensions without the leading dot (``"py"``, not
        ``".py"``). Paths that have no extension contribute nothing.
    """
    file_types = set()
    for element in tree.tree:
        _, ext = os.path.splitext(element.path)
        # splitext returns the extension WITH its leading dot (".py"), so the
        # dot is the first character; slicing [:-1] would instead chop the
        # last letter and yield ".p".
        file_types.add(ext[1:])
    # Paths without an extension (or ending in a bare ".") produce "".
    file_types.discard("")
    return sorted(file_types)
def _normalize_suffixes(file_types):
    """Convert user-supplied file types into dotted suffixes, or None for "all files"."""
    if not file_types:
        # None, or an empty list from a bare ``--file-types``: no filtering.
        return None
    # Accept both "py" and ".py"; always match on ".py" so that a type such
    # as "py" does not accidentally match a file called "copy".
    suffixes = tuple("." + ft.lstrip(".") for ft in file_types if ft.strip("."))
    return suffixes or None


def _decode_blob(blob):
    """Return the text content of a git blob, or a placeholder if it is not UTF-8 text."""
    try:
        raw = blob.content
        if getattr(blob, "encoding", None) == "base64":
            return base64.b64decode(raw).decode("utf-8")
        if isinstance(raw, bytes):
            return raw.decode("utf-8")
        if isinstance(raw, str):
            # Already text; calling .decode() on a str would raise AttributeError.
            return raw
    except (ValueError, TypeError):
        # ValueError covers both UnicodeDecodeError (binary files) and
        # binascii.Error (malformed base64); TypeError covers missing content.
        return "[Content not decodable]"
    return "[Content not decodable]"


def flatten_repo(repo, tree, file_types=None):
    """Collect the text of every matching file in ``tree`` and write it to one file.

    Args:
        repo: Repository object providing ``get_git_blob(sha)``; it is also
            passed on to ``write_to_file``.
        tree: Object whose ``tree`` attribute is an iterable of entries with
            ``type``, ``path`` and ``sha`` attributes.
        file_types: Optional iterable of extensions to include, with or
            without the leading dot (``"py"`` or ``".py"``). ``None`` or an
            empty list includes every file.

    Returns:
        Whatever ``write_to_file`` returns for the collected sections.
    """
    suffixes = _normalize_suffixes(file_types)
    collected_files = []
    for item in tree.tree:
        # Only blobs are files; other tree entries carry no content.
        if item.type != "blob":
            continue
        if suffixes is not None and not item.path.endswith(suffixes):
            continue
        blob = repo.get_git_blob(item.sha)
        content_str = _decode_blob(blob)
        collected_files.append(f"\n'''--- {item.path} ---\n{content_str}\n'''")
    return write_to_file(repo, collected_files)
def write_to_file(repo, files_data):
    """Write the collected file sections to a timestamped text file.

    The file is created in the current working directory and named after the
    repository's ``full_name`` (with ``/`` replaced by ``_``) followed by the
    current local time as ``YYYYmmddHHMMSS``.

    Args:
        repo: Object exposing a ``full_name`` string.
        files_data: Iterable of already-formatted strings, written in order
            after a one-line header.

    Returns:
        The name of the file that was written.
    """
    slug = repo.full_name.replace('/', '_')
    stamp = f"{datetime.now():%Y%m%d%H%M%S}"
    file_name = f"{slug}_{stamp}.txt"
    header = f"*GitHub Repository \"{repo.full_name}\"*\n"
    with open(file_name, "w", encoding='utf-8') as out:
        out.write(header)
        out.writelines(files_data)
    return file_name
def main():
    """Parse the command line and run the ``list`` or ``flatten`` subcommand.

    Both subcommands take a repository name, resolve the tip of that
    repository's default branch and fetch its recursive git tree. ``list``
    prints the file types found in the tree; ``flatten`` writes the matching
    files to a text file and prints that file's name.

    If the repository cannot be accessed, the error is logged and the
    function returns without doing anything else.
    """
    parser = argparse.ArgumentParser(description="CLI tool to flatten a GitHub repo into a text file")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Subcommand for listing file types
    parser_list = subparsers.add_parser("list", help="List all file types in the repo")
    parser_list.add_argument("repo", type=str, help="Path to the GitHub repo")

    # Subcommand for outputting code to a text file
    parser_output = subparsers.add_parser("flatten", help="Output code from the repo to a text file")
    parser_output.add_argument("repo", type=str, help="Name of GitHub repo")
    parser_output.add_argument(
        "--file-types", nargs="*", type=str, help="Optional list of file types to include"
    )
    args = parser.parse_args()

    try:
        repo = gh.get_repo(args.repo)
        # Use the repository's own default branch instead of assuming "main",
        # so repositories that default to e.g. "master" also work.
        branch_ref = repo.get_git_ref(f"heads/{repo.default_branch}")
        sha = branch_ref.object.sha
        tree = repo.get_git_tree(sha, recursive=True)
    except Exception as e:
        # Top-level boundary: report any API/network failure and stop.
        logging.error("Error accessing repository: %s", e)
        return

    if args.command == "list":
        file_types = list_file_types(tree)
        # print() does not interpolate %s placeholders, so format explicitly.
        print(f"File types in {repo.full_name}: {', '.join(file_types)}")
    elif args.command == "flatten":
        output_file = flatten_repo(repo, tree, args.file_types)
        print(f"Repo saved to {output_file}")
    else:
        # Defensive only: the subparsers are declared required above.
        parser.print_help()
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment