Created
April 24, 2024 21:06
-
-
Save shaayaansayed/e1efcefeb5c331cfb5e8347957a28c40 to your computer and use it in GitHub Desktop.
CLI tool to flatten a repo by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
GitHub Repo Flattener

This CLI tool flattens a GitHub repository by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.

Features:
- Flatten a GitHub repository into a single text file
- Specify which file types to include (e.g., skip certain files like .md)
- List all file types in the repository before flattening (helpful for large repositories)

---

Requirements:
- python-dotenv
- PyGithub
- Create a .env file in the same directory as the script and add your GitHub access token
```
GITHUB_TOKEN=your_github_access_token
```

---

Usage:
1. List file types in a repository:
    python flatten_repo.py list {org}/{repo_name}
    example: `python flatten_repo.py list infiniflow/ragflow`
    output: File types in infiniflow/ragflow: .con, .cs, .cud, .j, .jp, .js, .jso, .les, .m, .p, .pe, .pn, .re, .s, .scratc, .sq, .sv, .t, .ts, .tx, .woff, .yam, .ym
2. Flatten a repo:
    python flatten_repo.py flatten {org}/{repo_name}
    example: `python flatten_repo.py flatten infiniflow/ragflow`
3. Flatten a repo with specific file types:
    python flatten_repo.py flatten {org}/{repo_name} --file-types file_type1 file_type2 ...
    example: `python flatten_repo.py flatten infiniflow/ragflow --file-types py ts`
"""
import argparse
import base64
import logging
import os
from datetime import datetime

from dotenv import load_dotenv
from github import Github

# Load variables from a local .env file (see the usage notes above) into the
# process environment before the token is read.
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # None when the variable is not set

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Module-level GitHub API client; main() uses it to look up the repository.
gh = Github(GITHUB_TOKEN)
def list_file_types(tree):
    """Return the sorted, de-duplicated file extensions found in a git tree.

    Args:
        tree: Object whose ``tree`` attribute is an iterable of entries, each
            exposing a ``path`` string.

    Returns:
        Sorted list of extensions without the leading dot (``"py"``, not
        ``".py"``). Paths without an extension contribute nothing.
    """
    file_types = set()
    for element in tree.tree:
        _, ext = os.path.splitext(element.path)
        # splitext keeps the leading '.', so drop the FIRST character.
        # (The previous ext[:-1] removed the last character instead, turning
        # ".json" into ".jso".)
        file_types.add(ext[1:])
    # Entries with no extension (or a bare trailing '.') yield "".
    file_types.discard("")
    return sorted(file_types)
def _decode_blob(blob):
    """Return the text of a git blob object, decoding base64 payloads as UTF-8.

    Raises ``ValueError`` (including ``UnicodeDecodeError``) or ``TypeError``
    when the content cannot be turned into text.
    """
    raw = blob.content
    if getattr(blob, "encoding", None) == "base64":
        return base64.b64decode(raw).decode("utf-8")
    if isinstance(raw, (bytes, bytearray)):
        return raw.decode("utf-8")
    if isinstance(raw, str):
        # Already text: calling .decode() on a str would raise AttributeError.
        return raw
    raise TypeError(f"unsupported blob content type: {type(raw).__name__}")


def flatten_repo(repo, tree, file_types=None):
    """Collect the text of every matching blob in ``tree`` and write it to one file.

    Args:
        repo: Repository object providing ``get_git_blob(sha)``; it is also
            passed on to ``write_to_file``.
        tree: Object whose ``tree`` attribute lists entries exposing ``type``,
            ``path`` and ``sha``.
        file_types: Optional iterable of file types to include, with or
            without a leading dot (``"py"`` or ``".py"``). A type also matches
            a file whose whole name equals it (e.g. ``"Dockerfile"``).
            ``None`` includes every blob.

    Returns:
        The output file name returned by ``write_to_file``.
    """

    def is_wanted(path):
        if file_types is None:
            return True
        name = path.rsplit("/", 1)[-1]
        # Match on a real extension boundary so "py" does not also select
        # files that merely end in those letters (e.g. "numpy").
        return any(
            name == ft or name.endswith("." + ft.lstrip("."))
            for ft in file_types
        )

    collected_files = []
    for item in tree.tree:
        # Only blobs carry file content; other entry types are skipped.
        if item.type != "blob" or not is_wanted(item.path):
            continue
        blob = repo.get_git_blob(item.sha)
        try:
            content_str = _decode_blob(blob)
        except (ValueError, TypeError):
            # ValueError covers UnicodeDecodeError and malformed base64.
            content_str = "[Content not decodable]"
        collected_files.append(f"\n'''--- {item.path} ---\n{content_str}\n'''")
    return write_to_file(repo, collected_files)
def write_to_file(repo, files_data):
    """Write the collected file sections to a timestamped text file.

    The file is created relative to the current working directory. Its name is
    ``repo.full_name`` with ``/`` replaced by ``_``, followed by ``_`` and the
    local time formatted as ``YYYYmmddHHMMSS``, with a ``.txt`` suffix.

    Args:
        repo: Object exposing a ``full_name`` string.
        files_data: Iterable of already-formatted text sections, written in
            order after a one-line header naming the repository.

    Returns:
        The name of the file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    file_name = f"{repo.full_name.replace('/', '_')}_{timestamp}.txt"
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(f"*GitHub Repository \"{repo.full_name}\"*\n")
        f.writelines(files_data)
    return file_name
def main():
    """Parse the command line and run the ``list`` or ``flatten`` subcommand.

    Both subcommands take the repository as ``{org}/{repo_name}``. The file
    tree is read recursively from the head of the repository's default branch.
    If the repository or its tree cannot be fetched, the error is logged and
    the function returns without doing anything else.
    """
    parser = argparse.ArgumentParser(description="CLI tool to flatten a GitHub repo into a text file")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Subcommand for listing file types
    parser_list = subparsers.add_parser("list", help="List all file types in the repo")
    parser_list.add_argument("repo", type=str, help="Path to the GitHub repo")

    # Subcommand for outputting code to a text file
    parser_output = subparsers.add_parser("flatten", help="Output code from the repo to a text file")
    parser_output.add_argument("repo", type=str, help="Name of GitHub repo")
    parser_output.add_argument(
        "--file-types", nargs="*", type=str, help="Optional list of file types to include"
    )

    args = parser.parse_args()

    try:
        repo = gh.get_repo(args.repo)
        # Use the repository's own default branch rather than assuming "main",
        # so repositories whose default branch has another name still work.
        branch_ref = repo.get_git_ref(f"heads/{repo.default_branch}")
        sha = branch_ref.object.sha
        tree = repo.get_git_tree(sha, recursive=True)
    except Exception as e:
        logging.error("Error accessing repository: %s", e)
        return

    if args.command == "list":
        file_types = list_file_types(tree)
        # print() does not interpolate %s placeholders the way logging does,
        # so build the message explicitly.
        print(f"File types in {repo.full_name}: {', '.join(file_types)}")
    elif args.command == "flatten":
        output_file = flatten_repo(repo, tree, args.file_types)
        print(f"Repo saved to {output_file}")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment