@thesephist
Created December 15, 2022 01:57
Data prep script to fine-tune GPT-3 on my past writing
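
Each line of the output JSONL file is one fine-tuning sample with an empty prompt and a chunk of one post as the completion, roughly like this (the completion text here is just an illustrative placeholder):

{"prompt": "", "completion": "The opening paragraphs of one post, split off so the whole chunk fits in a single GPT-3 context..."}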
#!/usr/bin/env python
import os
import json

from tqdm import tqdm
from transformers import GPT2TokenizerFast

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

FILENAME = './thesephist.jsonl'
CORPUS_DIRS = [
    '/Users/thesephist/src/www/content/posts',
    '/Users/thesephist/src/dotink/content/posts',
    '/Users/thesephist/src/coffee/content/note',
]
MAX_TOKENS_PER_SAMPLE = 2048  # davinci (pre-002) has a max context length of 2048, not 4096

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')

def count_tokens(s: str) -> int:
    return len(tokenizer(s).input_ids)

# find all the files
files = []
for corpus_dir in CORPUS_DIRS:
    for draft in os.listdir(corpus_dir):
        files.append((
            os.path.join(corpus_dir, draft),
            corpus_dir.split(os.sep)[-2] + os.sep + draft,
        ))

# read all my writing and collect them up in a list
drafts = []
docs = []
for (filepath, draft) in (bar := tqdm(files)):
    bar.set_description('Preprocessing files')
    with open(filepath, 'r') as f:
        lines = f.readlines()

    # get rid of all the Markdown front matter
    if lines[0] == '---\n':
        lines = lines[1:]
        while lines[0] != '---\n':
            lines = lines[1:]
        lines = lines[1:]

    content = ''.join(lines).strip()
    paras = [p.strip() for p in content.split('\n\n') if p.strip() != '']

    # split long documents into short ones that fit in one GPT-3 context each
    paras_sofar = []
    for p in paras:
        tokens_sofar = count_tokens('\n\n'.join(paras_sofar + [p]))
        if tokens_sofar > MAX_TOKENS_PER_SAMPLE:
            drafts.append(draft)
            docs.append('\n\n'.join(paras_sofar))
            paras_sofar = []
        paras_sofar.append(p)
    if len(paras_sofar) > 0:
        drafts.append(draft)
        docs.append('\n\n'.join(paras_sofar))

# tokenize using the GPT-2 tokenizer, which approximates GPT-3's tokenizer
# extremely well in my testing
docs_bar = tqdm(docs)
docs_bar.set_description('Tokenizing')
token_counts = [count_tokens(doc) for doc in docs_bar]

finetune_docs = [{ 'prompt': '', 'completion': text.strip() } for text in docs]

# report some stats
count = len(finetune_docs)
size = len(''.join(docs))
tokens_estimate = sum(token_counts)
mean_tokens = tokens_estimate / len(docs)
print(f'''Creating finetuning JSONL data with:
{count} drafts
{size//1000} KB
{tokens_estimate:.2f} tokens ({mean_tokens:.2f}tok/sample)''')
print(f'''At this size, training for 4 epochs will cost:
ada ${0.0004 * tokens_estimate / 1000 * 4:.2f}
babbage ${0.0006 * tokens_estimate / 1000 * 4:.2f}
curie ${0.003 * tokens_estimate / 1000 * 4:.2f}
davinci ${0.03 * tokens_estimate / 1000 * 4:.2f}''')

# write dataset JSONL file
with open(FILENAME, 'w+') as f:
    for doc in finetune_docs:
        json.dump(doc, f)
        f.write('\n')
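
Before kicking off a fine-tune, it can be worth re-reading the JSONL and confirming that every sample parses and stays under the token cap. A minimal sketch, assuming the script above has already written ./thesephist.jsonl:

#!/usr/bin/env python
# Sanity-check sketch: re-read the generated JSONL and count any samples
# that exceed the davinci (pre-002) context limit. Assumes ./thesephist.jsonl
# was produced by the script above.
import json
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')

with open('./thesephist.jsonl') as f:
    samples = [json.loads(line) for line in f]

too_long = sum(
    1 for s in samples
    if len(tokenizer(s['completion']).input_ids) > 2048
)
print(f'{len(samples)} samples, {too_long} over the 2048-token limit')

At the time, the openai CLI also shipped a validator (openai tools fine_tunes.prepare_data) that catches common formatting issues in the JSONL before upload.
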
#!/bin/bash

# environment
export OPENAI_API_KEY=sk-XXXX
export FILENAME=thesephist.jsonl
export MODEL_NAME=davinci

# train
source venv/bin/activate
openai api fine_tunes.create -t "$FILENAME" -m "$MODEL_NAME" --suffix "text-davinci-thesephist"
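
Once the fine-tune job finishes, sampling with an empty prompt mirrors the training setup. A rough sketch using the pre-1.0 openai Python library that was current at the time; the model name below is a hypothetical placeholder, not the actual fine-tune ID:

#!/usr/bin/env python
# Sampling sketch using the pre-1.0 openai Python library (current as of
# late 2022). The model name is a placeholder; the real one comes from the
# output of fine_tunes.create above.
import os
import openai

openai.api_key = os.environ['OPENAI_API_KEY']

resp = openai.Completion.create(
    model='davinci:ft-personal:text-davinci-thesephist-2022-12-15',  # placeholder
    prompt='',          # training samples used an empty prompt
    max_tokens=256,
    temperature=0.9,
)
print(resp.choices[0].text)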