
@carterprince
Last active March 17, 2023 16:50
Python script that takes a given file and splits it into new files of at most N GPT-3 tokens each (counted with the GPT-2 tokenizer, which GPT-3 shares).
import sys

from transformers import GPT2Tokenizer


def split_file(file_path, output_prefix, tokens_per_file):
    # GPT-3 uses the same BPE vocabulary as GPT-2, so the GPT-2 tokenizer gives matching token counts
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    with open(file_path, 'r') as file:
        text = file.read()

    tokens = tokenizer.encode(text)

    file_count = 1
    idx = 0
    while idx < len(tokens):
        output_file_name = f"{output_prefix}_{file_count}.txt"

        # take up to tokens_per_file tokens for this chunk
        current_tokens = []
        while len(current_tokens) < tokens_per_file and idx < len(tokens):
            current_tokens.append(tokens[idx])
            idx += 1

        # decode the chunk back into text and write it out
        with open(output_file_name, 'w') as output_file:
            output_file.write(tokenizer.decode(current_tokens))

        file_count += 1

    print(f"Successfully split the file into {file_count - 1} files.")


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python split_file.py <input_file> <output_prefix> <tokens_per_file>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_prefix = sys.argv[2]
    tokens_per_file = int(sys.argv[3])

    if tokens_per_file < 1:
        print("Error: tokens_per_file must be a positive integer.")
        sys.exit(1)

    split_file(input_file, output_prefix, tokens_per_file)
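Beyond the CLI shown in the usage string, the function can also be imported and called directly. A minimal sketch, assuming the script is saved as split_file.py and that a file named book.txt exists (both placeholder names), with the transformers package installed via pip install transformers:

# hypothetical usage example, not part of the original gist
from split_file import split_file

# writes book_part_1.txt, book_part_2.txt, ... each holding up to 2000 tokens
split_file("book.txt", "book_part", 2000)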