|
import os |
|
import subprocess |
|
import argparse |
|
import fnmatch |
|
from urllib.parse import urlparse |
|
|
|
# Directory names that are removed from the walk, so nothing beneath them
# is ever visited.
EXCLUDE_DIRS = [
    'node_modules',
    'dist',
    'build',
    '.git',
    'venv',
]

# File name suffixes that count as source code; files with any other
# suffix are skipped.
SOURCE_CODE_EXTENSIONS = [
    '.py', '.md', '.mdx',
    '.js', '.java',
    '.cpp', '.c', '.h', '.cs',
    '.rb', '.go', '.rs', '.ts',
    '.php', '.html', '.css',
]
|
|
|
def clone_repo(url, clone_dir, branch=None):
    """
    Clone a Git repository into ``clone_dir`` by running ``git clone``.

    Args:
        url: Repository URL handed to ``git clone``.
        clone_dir: Directory the repository is cloned into.
        branch: Optional branch (or tag) to check out instead of the
            remote's default. ``None`` (the default) leaves the choice to
            git, exactly as before this parameter existed.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
    """
    command = ['git', 'clone']
    if branch:
        # The "--branch=<name>" form keeps the name attached to its option
        # even if it starts with a dash.
        command.append(f'--branch={branch}')
    # "--" ends option parsing, so a URL or directory beginning with "-"
    # cannot be mistaken by git for a command-line option.
    command.extend(['--', url, clone_dir])
    subprocess.run(command, check=True)
|
|
|
def filter_and_print_files(root_dir, filename_filter=None, subfolder_filter=None):
    """
    Print the contents of source code files found under ``root_dir``.

    The tree is walked with ``os.walk``; directories named in
    ``EXCLUDE_DIRS`` are pruned, and only files whose name ends with one of
    ``SOURCE_CODE_EXTENSIONS`` are printed. Each file is printed as a
    ``File: <path>`` header, its text, and a line of 80 ``=`` characters.

    Args:
        root_dir: Directory to walk.
        filename_filter: Optional ``fnmatch`` pattern; when given, only
            files whose name matches it are printed.
        subfolder_filter: Optional path relative to ``root_dir``; when
            given, only files in that folder or below it are printed.
    """
    extensions = tuple(SOURCE_CODE_EXTENSIONS)

    # Normalise once so the comparison below works on whole path components
    # and with the "/" separators that come from a URL.
    subfolder_root = None
    if subfolder_filter:
        subfolder_root = os.path.normpath(os.path.join(root_dir, subfolder_filter))

    for root, dirs, files in os.walk(root_dir):
        # In-place assignment is what makes os.walk skip the excluded dirs.
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]

        if subfolder_root is not None:
            current = os.path.normpath(root)
            # A bare prefix test would let "src" match "src2"; require the
            # folder itself or a path that continues with a separator.
            inside = (
                current == subfolder_root
                or current.startswith(subfolder_root + os.sep)
            )
            if not inside:
                continue

        for filename in files:
            if not filename.endswith(extensions):
                continue
            if filename_filter and not fnmatch.fnmatch(filename, filename_filter):
                continue

            file_path = os.path.join(root, filename)
            print(f"File: {file_path}")
            # errors='replace' keeps one badly encoded file from aborting
            # the whole dump with UnicodeDecodeError.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                print(f.read())
            print("\n" + "="*80 + "\n")
|
|
|
def parse_github_url(url):
    """
    Split a GitHub URL into a clone URL, a branch and a subfolder.

    Two URL shapes are understood:

    * ``https://github.com/owner/repo`` (a trailing ``.git`` is accepted)
    * ``https://github.com/owner/repo/tree/<branch>/<path...>``

    Args:
        url: The GitHub URL to parse.

    Returns:
        A ``(repo_url, branch, subfolder)`` tuple. ``repo_url`` is always
        ``https://github.com/<owner>/<repo>.git``. ``branch`` is ``'main'``
        and ``subfolder`` is ``''`` unless the URL has a ``tree`` part.

    Raises:
        ValueError: If the URL path does not contain an owner and a repo.
    """
    parsed_url = urlparse(url)
    # Drop empty segments so leading, trailing or doubled slashes never end
    # up as an owner, repo or branch name.
    path_parts = [part for part in parsed_url.path.split('/') if part]

    if len(path_parts) < 2:
        raise ValueError("Invalid GitHub URL. Expected format: https://github.com/owner/repo")

    owner = path_parts[0]
    repo = path_parts[1]
    # A clone-style URL already ends in ".git"; strip it so the suffix is
    # not appended twice below.
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    if not repo:
        raise ValueError("Invalid GitHub URL. Expected format: https://github.com/owner/repo")

    branch = 'main'
    subfolder = ''

    # Only the segment directly after the repo marks a tree URL. Searching
    # the whole path would misfire on an owner or repo named "tree".
    # NOTE: a branch name containing "/" cannot be told apart from the path
    # that follows it; the first segment after "tree" is taken as the branch.
    if len(path_parts) > 3 and path_parts[2] == 'tree':
        branch = path_parts[3]
        subfolder = '/'.join(path_parts[4:])

    repo_url = f"https://github.com/{owner}/{repo}.git"

    return repo_url, branch, subfolder
|
|
|
def main():
    """
    Command-line entry point.

    Reads the repository URL plus the optional ``--filter`` and ``--dest``
    options, clones the repository into the destination directory, and
    prints the matching source files from it.
    """
    cli = argparse.ArgumentParser(
        description='Clone a GitHub repository and print the content of source code files.',
    )
    cli.add_argument('url', type=str, help='GitHub repository URL')
    cli.add_argument(
        '--filter',
        type=str,
        default=None,
        help='Filename filter (optional)',
    )
    cli.add_argument(
        '--dest',
        type=str,
        default='cloned_repo',
        help='Destination directory (optional)',
    )
    options = cli.parse_args()

    # The branch parsed from the URL is not used below; only the clone URL
    # and the subfolder are passed on.
    clone_url, _branch, subfolder = parse_github_url(options.url)

    destination = options.dest
    clone_repo(clone_url, destination)
    filter_and_print_files(destination, options.filter, subfolder)
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()