shaayaansayed/flatten_repo.py

## flatten_repo.py
"""
GitHub Repo Flattener

This CLI tool flattens a GitHub repository by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.

Features:

- Flatten a GitHub repository into a single text file
- Specify which file types to include (e.g., skip certain files like .md)
- List all file types in the repository before flattening (helpful for large repositories)

---

Requirements:

- python-dotenv
- PyGithub
- Create a .env file in the same directory as the script and add your GitHub access token

```
GITHUB_TOKEN=your_github_access_token
```

---

Usage:

1. List file types in a repository:

python flatten_repo.py list {org}/{repo_name}

example: `python flatten_repo.py list infiniflow/ragflow`
output: File types in infiniflow/ragflow: .con, .cs, .cud, .j, .jp, .js, .jso, .les, .m, .p, .pe, .pn, .re, .s, .scratc, .sq, .sv, .t, .ts, .tx, .woff, .yam, .ym

2. Flatten a repo:

python flatten_repo.py flatten {org}/{repo_name}

example: `python flatten_repo.py flatten infiniflow/ragflow`

3. Flatten a repo with specific file types

python flatten_repo.py flatten {org}/{repo_name} --file-types file_type1 file_type2 ...

example: `python flatten_repo.py flatten infiniflow/ragflow --file-types py ts`
"""


import argparse
from datetime import datetime
import os
import base64
import logging
from github import Github
from dotenv import load_dotenv

# Load variables from a local .env file (see the module docstring) so the
# token lookup below can find GITHUB_TOKEN.
load_dotenv()

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")  # None when the variable is unset
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Shared GitHub client used by main().
gh = Github(GITHUB_TOKEN)

def list_file_types(tree):
    """Return the sorted, de-duplicated file extensions found in a Git tree.

    Args:
        tree: Object whose ``tree`` attribute is an iterable of entries, each
            exposing a ``path`` string.

    Returns:
        Alphabetically sorted list of extensions without the leading dot
        (e.g. ``["md", "py"]``). Paths that have no extension, including
        dotfiles such as ``.gitignore``, contribute nothing.
    """
    file_types = set()
    for element in tree.tree:
        _, ext = os.path.splitext(element.path)
        # splitext keeps the leading '.', so drop the FIRST character.
        # (Slicing with [:-1] removed the last one, turning ".py" into ".p".)
        file_types.add(ext[1:])
    # Paths without an extension (or ending in a bare '.') produce "".
    file_types.discard("")
    return sorted(file_types)

def _matches_file_types(path, file_types):
    """Return True when ``path`` should be included for the requested types."""
    if file_types is None:
        return True
    # Tree paths are '/'-separated; keep only the final component.
    name = path.rsplit("/", 1)[-1]
    for file_type in file_types:
        bare = file_type.lstrip(".")
        # Anchor on the dot so "py" selects "a.py" but not "happy"; an exact
        # file-name match still lets names such as "Dockerfile" be requested.
        if bare and (name == file_type or name.endswith("." + bare)):
            return True
    return False


def _decode_blob(blob):
    """Return the blob's content as text, or a placeholder if it is not UTF-8 text."""
    try:
        if getattr(blob, "encoding", None) == "base64":
            return base64.b64decode(blob.content).decode("utf-8")
        content = blob.content
        if isinstance(content, bytes):
            return content.decode("utf-8")
        if isinstance(content, str):
            # Already text: calling .decode() on a str would raise
            # AttributeError, which the handler below does not cover.
            return content
    except (ValueError, TypeError):
        # ValueError covers both UnicodeDecodeError (binary files) and
        # binascii.Error (malformed base64).
        pass
    return "[Content not decodable]"


def flatten_repo(repo, tree, file_types=None):
    """Fetch every matching file in ``tree`` and write them all to one text file.

    Args:
        repo: Repository object providing ``get_git_blob(sha)`` plus whatever
            ``write_to_file`` reads from it.
        tree: Object whose ``tree`` attribute iterates entries exposing
            ``type``, ``path`` and ``sha``.
        file_types: Optional iterable of extensions to keep, with or without
            the leading dot (``"py"`` and ``".py"`` are equivalent). ``None``
            keeps every file; an empty iterable keeps none.

    Returns:
        The name of the output file, as returned by ``write_to_file``.
    """
    collected_files = []

    for item in tree.tree:
        # Only "blob" entries are files; everything else has no content to fetch.
        if item.type != "blob" or not _matches_file_types(item.path, file_types):
            continue
        content_str = _decode_blob(repo.get_git_blob(item.sha))
        collected_files.append(f"\n'''--- {item.path} ---\n{content_str}\n'''")
    return write_to_file(repo, collected_files)


def write_to_file(repo, files_data):
    """Write the flattened repository to a timestamped text file.

    The file is created in the current working directory and is named
    ``<owner>_<repo>_<YYYYmmddHHMMSS>.txt``. It starts with a one-line header
    naming the repository, followed by each entry of ``files_data`` verbatim.

    Args:
        repo: Object exposing the repository's ``full_name``.
        files_data: Iterable of already formatted text chunks.

    Returns:
        The name of the file that was written.
    """
    safe_repo_name = repo.full_name.replace("/", "_")
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    file_name = f"{safe_repo_name}_{stamp}.txt"

    header = f'*GitHub Repository "{repo.full_name}"*\n'
    with open(file_name, "w", encoding="utf-8") as out:
        out.write(header)
        out.writelines(files_data)
    return file_name

def main():
    """Parse the command line and run the ``list`` or ``flatten`` subcommand.

    The file tree is read recursively from the repository's default branch.
    Any error raised while contacting GitHub is logged and ends the run
    without propagating.
    """
    parser = argparse.ArgumentParser(description="CLI tool to flatten a GitHub repo into a text file")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Subcommand for listing file types
    parser_list = subparsers.add_parser("list", help="List all file types in the repo")
    parser_list.add_argument("repo", type=str, help="Path to the GitHub repo")

    # Subcommand for outputting code to a text file
    parser_output = subparsers.add_parser("flatten", help="Output code from the repo to a text file")
    parser_output.add_argument("repo", type=str, help="Name of GitHub repo")
    parser_output.add_argument(
        "--file-types", nargs="*", type=str, help="Optional list of file types to include"
    )

    args = parser.parse_args()

    try:
        repo = gh.get_repo(args.repo)
        # Resolve the branch from the repository itself: a hard-coded
        # "heads/main" fails for repositories whose default branch has
        # another name (e.g. "master").
        branch_ref = repo.get_git_ref(f"heads/{repo.default_branch}")
        tree = repo.get_git_tree(branch_ref.object.sha, recursive=True)
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        logging.error("Error accessing repository: %s", e)
        return

    # print() does not interpolate %-style arguments the way logging does,
    # so the messages are built with f-strings.
    # No fallback branch: the subparser is required, so argparse has already
    # rejected anything other than "list" or "flatten".
    if args.command == "list":
        file_types = list_file_types(tree)
        print(f"File types in {repo.full_name}: {', '.join(file_types)}")
    elif args.command == "flatten":
        output_file = flatten_repo(repo, tree, args.file_types)
        print(f"Repo saved to {output_file}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
	"""
	GitHub Repo Flattener

	This CLI tool flattens a GitHub repository by merging its files into a single text file. Useful for feeding into LLMs to ask questions about the codebase.

	Features:

	- Flatten a GitHub repository into a single text file
	- Specify which file types to include (e.g., skip certain files like .md)
	- List all file types in the repository before flattening (helpful for large repositories)

	---

	Requirements:

	- python-dotenv
	- PyGithub
	- Create a .env file in the same directory as the script and add your GitHub access token

	```
	GITHUB_TOKEN=your_github_access_token
	```

	---

	Usage:

	1. List file types in a repository:

	python flatten_repo.py list {org}/{repo_name}

	example: `python flatten_repo.py list infiniflow/ragflow`
	output: File types in infiniflow/ragflow: .con, .cs, .cud, .j, .jp, .js, .jso, .les, .m, .p, .pe, .pn, .re, .s, .scratc, .sq, .sv, .t, .ts, .tx, .woff, .yam, .ym

	2. Flatten a repo:

	python flatten_repo.py flatten {org}/{repo_name}

	example: `python flatten_repo.py flatten infiniflow/ragflow`

	3. Flatten a repo with specific file types

	python flatten_repo.py flatten {org}/{repo_name} --file-types file_type1 file_type2 ...

	example: `python flatten_repo.py flatten infiniflow/ragflow --file-types py ts`
	"""


	import argparse
	from datetime import datetime
	import os
	import base64
	import logging
	from github import Github
	from dotenv import load_dotenv

	# Load variables from a local .env file (see the module docstring) so the
	# token lookup below can find GITHUB_TOKEN.
	load_dotenv()

	GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")  # None when the variable is unset
	logging.basicConfig(
		format="%(asctime)s - %(levelname)s - %(message)s",
		level=logging.INFO,
	)

	# Shared GitHub client used by main().
	gh = Github(GITHUB_TOKEN)

	def list_file_types(tree):
		"""Return the sorted, de-duplicated file extensions found in a Git tree.

		Args:
			tree: Object whose ``tree`` attribute is an iterable of entries,
				each exposing a ``path`` string.

		Returns:
			Alphabetically sorted list of extensions without the leading dot
			(e.g. ``["md", "py"]``). Paths that have no extension, including
			dotfiles such as ``.gitignore``, contribute nothing.
		"""
		file_types = set()
		for element in tree.tree:
			_, ext = os.path.splitext(element.path)
			# splitext keeps the leading '.', so drop the FIRST character.
			# (Slicing with [:-1] removed the last one, turning ".py" into ".p".)
			file_types.add(ext[1:])
		# Paths without an extension (or ending in a bare '.') produce "".
		file_types.discard("")
		return sorted(file_types)

	def _matches_file_types(path, file_types):
		"""Return True when ``path`` should be included for the requested types."""
		if file_types is None:
			return True
		# Tree paths are '/'-separated; keep only the final component.
		name = path.rsplit("/", 1)[-1]
		for file_type in file_types:
			bare = file_type.lstrip(".")
			# Anchor on the dot so "py" selects "a.py" but not "happy"; an exact
			# file-name match still lets names such as "Dockerfile" be requested.
			if bare and (name == file_type or name.endswith("." + bare)):
				return True
		return False


	def _decode_blob(blob):
		"""Return the blob's content as text, or a placeholder if it is not UTF-8 text."""
		try:
			if getattr(blob, "encoding", None) == "base64":
				return base64.b64decode(blob.content).decode("utf-8")
			content = blob.content
			if isinstance(content, bytes):
				return content.decode("utf-8")
			if isinstance(content, str):
				# Already text: calling .decode() on a str would raise
				# AttributeError, which the handler below does not cover.
				return content
		except (ValueError, TypeError):
			# ValueError covers both UnicodeDecodeError (binary files) and
			# binascii.Error (malformed base64).
			pass
		return "[Content not decodable]"


	def flatten_repo(repo, tree, file_types=None):
		"""Fetch every matching file in ``tree`` and write them all to one text file.

		Args:
			repo: Repository object providing ``get_git_blob(sha)`` plus
				whatever ``write_to_file`` reads from it.
			tree: Object whose ``tree`` attribute iterates entries exposing
				``type``, ``path`` and ``sha``.
			file_types: Optional iterable of extensions to keep, with or
				without the leading dot. ``None`` keeps every file; an empty
				iterable keeps none.

		Returns:
			The name of the output file, as returned by ``write_to_file``.
		"""
		collected_files = []

		for item in tree.tree:
			# Only "blob" entries are files; everything else has no content to fetch.
			if item.type != "blob" or not _matches_file_types(item.path, file_types):
				continue
			content_str = _decode_blob(repo.get_git_blob(item.sha))
			collected_files.append(f"\n'''--- {item.path} ---\n{content_str}\n'''")
		return write_to_file(repo, collected_files)


	def write_to_file(repo, files_data):
		"""Write the flattened repository to a timestamped text file.

		The file is created in the current working directory and is named
		``<owner>_<repo>_<YYYYmmddHHMMSS>.txt``. It starts with a one-line
		header naming the repository, followed by each entry of
		``files_data`` verbatim.

		Args:
			repo: Object exposing the repository's ``full_name``.
			files_data: Iterable of already formatted text chunks.

		Returns:
			The name of the file that was written.
		"""
		timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
		file_name = f"{repo.full_name.replace('/', '_')}_{timestamp}.txt"
		with open(file_name, "w", encoding="utf-8") as f:
			f.write(f"GitHub Repository \"{repo.full_name}\"\n")
			f.writelines(files_data)
		return file_name

	def main():
		"""Parse the command line and run the ``list`` or ``flatten`` subcommand.

		The file tree is read recursively from the repository's default branch.
		Any error raised while contacting GitHub is logged and ends the run
		without propagating.
		"""
		parser = argparse.ArgumentParser(description="CLI tool to flatten a GitHub repo into a text file")
		subparsers = parser.add_subparsers(dest="command", required=True)

		# Subcommand for listing file types
		parser_list = subparsers.add_parser("list", help="List all file types in the repo")
		parser_list.add_argument("repo", type=str, help="Path to the GitHub repo")

		# Subcommand for outputting code to a text file
		parser_output = subparsers.add_parser("flatten", help="Output code from the repo to a text file")
		parser_output.add_argument("repo", type=str, help="Name of GitHub repo")
		parser_output.add_argument(
			"--file-types", nargs="*", type=str, help="Optional list of file types to include"
		)

		args = parser.parse_args()

		try:
			repo = gh.get_repo(args.repo)
			# Resolve the branch from the repository itself: a hard-coded
			# "heads/main" fails for repositories whose default branch has
			# another name (e.g. "master").
			branch_ref = repo.get_git_ref(f"heads/{repo.default_branch}")
			tree = repo.get_git_tree(branch_ref.object.sha, recursive=True)
		except Exception as e:
			# Top-level boundary: report the failure instead of a traceback.
			logging.error("Error accessing repository: %s", e)
			return

		# print() does not interpolate %-style arguments the way logging does,
		# so the messages are built with f-strings.
		# No fallback branch: the subparser is required, so argparse has already
		# rejected anything other than "list" or "flatten".
		if args.command == "list":
			file_types = list_file_types(tree)
			print(f"File types in {repo.full_name}: {', '.join(file_types)}")
		elif args.command == "flatten":
			output_file = flatten_repo(repo, tree, args.file_types)
			print(f"Repo saved to {output_file}")

	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
		main()