Skip to content

Instantly share code, notes, and snippets.

@tensojka
Created March 13, 2024 13:29
Show Gist options
  • Save tensojka/a63369a00c94039e543d0b27e2ea1f9b to your computer and use it in GitHub Desktop.
Save tensojka/a63369a00c94039e543d0b27e2ea1f9b to your computer and use it in GitHub Desktop.
This script merges the files in a given directory into a single file, prefixing each file's contents with its name and separating the files with ---. The purpose is to make it easy to feed nearly an entire codebase to an LLM.
import os
import sys
import argparse
import re
import pathspec
import subprocess
def load_gitignore(root_dir):
    """Build a ``pathspec.PathSpec`` from the ``.gitignore`` in ``root_dir``.

    If the file cannot be opened or read, a warning is printed and an
    empty spec (one that matches nothing) is returned instead.
    """
    ignore_path = os.path.join(root_dir, '.gitignore')
    try:
        with open(ignore_path, 'r') as handle:
            patterns = handle.readlines()
    except IOError:
        print(f"Warning: Unable to read .gitignore in {root_dir}, ignoring .gitignore rules.")
        patterns = []
    # Single construction point: an empty pattern list yields an empty spec.
    return pathspec.PathSpec.from_lines('gitwildmatch', patterns)
def concatenate_files(root_dir, output_file_path, file_suffixes, startswith, poe_flag):
    """Merge the matching files under ``root_dir`` into ``output_file_path``.

    Each included file is written as a ``---`` separator line, the file's
    path relative to ``root_dir``, the file's contents, and a trailing
    newline. After writing, the external ``claude_token_counter`` command
    is run on the output file and its stdout is printed.

    Args:
        root_dir: Directory to walk. The process exits with status 1 if it
            is not an existing directory.
        output_file_path: Path of the merged file to create or overwrite.
        file_suffixes: File-name suffixes to include; when empty, a warning
            is printed and no suffix filtering is applied.
        startswith: File-name prefixes to include; when empty, no prefix
            filtering is applied.
        poe_flag: When true, every ``https://`` in the copied text is
            rewritten to ``https-//``.

    Files matched by the ``.gitignore`` in ``root_dir`` are skipped, as are
    directories whose name starts with ``.git`` and the output file itself.
    """
    if not os.path.isdir(root_dir):
        print(f"Error: The specified directory '{root_dir}' does not exist.")
        sys.exit(1)
    if not file_suffixes:
        print("Warning: No file suffixes specified.")

    ignore_spec = load_gitignore(root_dir)
    # str.endswith / str.startswith accept a tuple of alternatives.
    suffixes = tuple(file_suffixes or ())
    prefixes = tuple(startswith or ())
    # Resolved once so the output file can be recognised during the walk.
    output_real_path = os.path.realpath(output_file_path)

    try:
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for root, dirs, files in os.walk(root_dir):
                # Prune in place so os.walk does not descend into these
                # directories; sorting makes the output order reproducible.
                dirs[:] = sorted(d for d in dirs if not d.startswith('.git'))
                for file in sorted(files):
                    if suffixes and not file.endswith(suffixes):
                        continue
                    if prefixes and not file.startswith(prefixes):
                        continue
                    file_path = os.path.join(root, file)
                    # Never feed the output file back into itself when it
                    # lives inside root_dir.
                    if os.path.realpath(file_path) == output_real_path:
                        continue
                    relative_path = os.path.relpath(file_path, root_dir)
                    # .gitignore patterns are relative to the directory that
                    # holds the .gitignore, so match the relative path rather
                    # than the path prefixed with root_dir.
                    if ignore_spec.match_file(relative_path):
                        continue
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as input_file:
                            # The header is written only once the input is
                            # open, so unreadable files leave no orphan entry.
                            output_file.write(f'---\n{relative_path}\n')
                            for line in input_file:
                                if poe_flag:
                                    line = line.replace('https://', 'https-//')
                                output_file.write(line)
                            output_file.write('\n')
                    except IOError:
                        print(f"Warning: Failed to read file {file_path}")
    except IOError:
        print(f"Error: Unable to write to output file '{output_file_path}'")
        sys.exit(1)

    try:
        result = subprocess.run(
            ['claude_token_counter', output_file_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers a missing or non-executable token counter; the
        # merged file has already been written, so only report the failure.
        print(f"Error: Failed to run token counter: {e}")
    else:
        print(result.stdout.strip())
def main():
    """Parse the command line and hand the options to ``concatenate_files``."""
    cli = argparse.ArgumentParser(
        description="Concatenate files in a directory, excluding those specified in .gitignore",
    )
    cli.add_argument('root_dir', type=str, help='Root directory to search for files')
    cli.add_argument('output_file', type=str, help='Path to the output file')

    # Both filter options take one or more values and default to an empty list.
    list_options = (
        ('--filetype', 'File extensions to filter for concatenation (accepts multiple)'),
        ('--startswith', 'File name prefixes to filter for concatenation (accepts multiple)'),
    )
    for flag, help_text in list_options:
        cli.add_argument(flag, type=str, nargs='+', default=[], help=help_text)

    cli.add_argument('--poe', action='store_true', help='Enable Poe URL filtering')

    options = cli.parse_args()
    concatenate_files(
        options.root_dir,
        options.output_file,
        options.filetype,
        options.startswith,
        options.poe,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment