@thesephist
Created December 15, 2022 01:57
Data prep script to fine-tune GPT-3 on my past writing
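
Each line of the output JSONL file is one fine-tuning sample with an empty prompt and a chunk of one post as the completion, roughly like this (the completion text here is just an illustrative placeholder):

{"prompt": "", "completion": "The opening paragraphs of one post, split off so the whole chunk fits in a single GPT-3 context..."}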
#!/usr/bin/env python
import os
import json

from tqdm import tqdm
from transformers import GPT2TokenizerFast

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

FILENAME = './thesephist.jsonl'
CORPUS_DIRS = [
    '/Users/thesephist/src/www/content/posts',
    '/Users/thesephist/src/dotink/content/posts',
    '/Users/thesephist/src/coffee/content/note',
]
MAX_TOKENS_PER_SAMPLE = 2048  # davinci (pre-002) has a max context length of 2048, not 4096

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')

def count_tokens(s: str) -> int:
    return len(tokenizer(s).input_ids)

# find all the files
files = []
for corpus_dir in CORPUS_DIRS:
    for draft in os.listdir(corpus_dir):
        files.append((
            os.path.join(corpus_dir, draft),
            corpus_dir.split(os.sep)[-2] + os.sep + draft,
        ))

# read all my writing and collect them up in a list
drafts = []
docs = []
for (filepath, draft) in (bar := tqdm(files)):
    bar.set_description('Preprocessing files')
    with open(filepath, 'r') as f:
        lines = f.readlines()

    # get rid of all the Markdown front matter
    if lines[0] == '---\n':
        lines = lines[1:]
        while lines[0] != '---\n':
            lines = lines[1:]
        lines = lines[1:]

    content = ''.join(lines).strip()
    paras = [p.strip() for p in content.split('\n\n') if p.strip() != '']

    # split long documents into short ones that fit in one GPT-3 context each
    paras_sofar = []
    for p in paras:
        tokens_sofar = count_tokens('\n\n'.join(paras_sofar + [p]))
        if tokens_sofar > MAX_TOKENS_PER_SAMPLE:
            drafts.append(draft)
            docs.append('\n\n'.join(paras_sofar))
            paras_sofar = []
        paras_sofar.append(p)
    if len(paras_sofar) > 0:
        drafts.append(draft)
        docs.append('\n\n'.join(paras_sofar))

# tokenize using the GPT-2 tokenizer, which approximates GPT-3's tokenizer
# extremely well in my testing
docs_bar = tqdm(docs)
docs_bar.set_description('Tokenizing')
token_counts = [count_tokens(doc) for doc in docs_bar]

finetune_docs = [{ 'prompt': '', 'completion': text.strip() } for text in docs]

# report some stats
count = len(finetune_docs)
size = len(''.join(docs))
tokens_estimate = sum(token_counts)
mean_tokens = tokens_estimate / len(docs)
print(f'''Creating finetuning JSONL data with:
{count} drafts
{size//1000} KB
{tokens_estimate:.2f} tokens ({mean_tokens:.2f}tok/sample)''')
print(f'''At this size, training for 4 epochs will cost:
ada ${0.0004 * tokens_estimate / 1000 * 4:.2f}
babbage ${0.0006 * tokens_estimate / 1000 * 4:.2f}
curie ${0.003 * tokens_estimate / 1000 * 4:.2f}
davinci ${0.03 * tokens_estimate / 1000 * 4:.2f}''')

# write dataset JSONL file
with open(FILENAME, 'w+') as f:
    for doc in finetune_docs:
        json.dump(doc, f)
        f.write('\n')
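
Before kicking off a fine-tune, it can be worth re-reading the JSONL and confirming that every sample parses and stays under the token cap. A minimal sketch, assuming the script above has already written ./thesephist.jsonl:

#!/usr/bin/env python
# Sanity-check sketch: re-read the generated JSONL and count any samples
# that exceed the davinci (pre-002) context limit. Assumes ./thesephist.jsonl
# was produced by the script above.
import json
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')

with open('./thesephist.jsonl') as f:
    samples = [json.loads(line) for line in f]

too_long = sum(
    1 for s in samples
    if len(tokenizer(s['completion']).input_ids) > 2048
)
print(f'{len(samples)} samples, {too_long} over the 2048-token limit')

At the time, the openai CLI also shipped a validator (openai tools fine_tunes.prepare_data) that catches common formatting issues in the JSONL before upload.
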
#!/bin/bash

# environment
export OPENAI_API_KEY=sk-XXXX
export FILENAME=thesephist.jsonl
export MODEL_NAME=davinci

# train
source venv/bin/activate
openai api fine_tunes.create -t "$FILENAME" -m "$MODEL_NAME" --suffix "text-davinci-thesephist"
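
Once the fine-tune job finishes, sampling with an empty prompt mirrors the training setup. A rough sketch using the pre-1.0 openai Python library that was current at the time; the model name below is a hypothetical placeholder, not the actual fine-tune ID:

#!/usr/bin/env python
# Sampling sketch using the pre-1.0 openai Python library (current as of
# late 2022). The model name is a placeholder; the real one comes from the
# output of fine_tunes.create above.
import os
import openai

openai.api_key = os.environ['OPENAI_API_KEY']

resp = openai.Completion.create(
    model='davinci:ft-personal:text-davinci-thesephist-2022-12-15',  # placeholder
    prompt='',          # training samples used an empty prompt
    max_tokens=256,
    temperature=0.9,
)
print(resp.choices[0].text)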