Skip to content

Instantly share code, notes, and snippets.

@recordcrash
Created April 7, 2023 13:58
Show Gist options
  • Save recordcrash/1f2204c82e3142328508037cf74951ad to your computer and use it in GitHub Desktop.
Nachlass Notebook (Discord messages to OpenAI davinci finetuning)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# DATA PROCESSING\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# SECURITY: never commit a real key in source; prefer the OPENAI_API_KEY\n",
"# environment variable. The original placeholder is kept as the fallback.\n",
"OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\", \"sk-something\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import json\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Load the full Discord export 'messages_all.json' (~60 million lines) into a dataframe.\n",
"# Swap in 'messages5m.json' here for the smaller ~5M-line subset.\n",
"with open('messages_all.json', encoding='UTF8') as f:\n",
"    data = json.load(f)\n",
"df = pd.DataFrame(data[\"messages\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Check dataframe\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Flatten the author column: keep only the 'name' field of each author object.\n",
"df['author'] = df['author'].map(lambda author: author['name'])\n",
"df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Keep only the needed columns (id, type, timestamp, content, author, mentions)\n",
"df = df[['id', 'type', 'timestamp', 'content', 'author', 'mentions']]\n",
"# Strip the bot ping from the content (with and then without the trailing space).\n",
"# Vectorized .str.replace avoids a Python-level lambda call per row.\n",
"df['content'] = df['content'].str.replace('@Drew Linky ', '', regex=False)\n",
"df['content'] = df['content'].str.replace('@Drew Linky', '', regex=False)\n",
"# '###' and ' END' are the fine-tuning prompt/completion sentinels used later;\n",
"# scrub them from the data so they cannot confuse the model.\n",
"df['content'] = df['content'].str.replace('###', '', regex=False)\n",
"df['content'] = df['content'].str.replace(' END', '', regex=False)\n",
"# Drop rows whose content became empty after removing the ping.\n",
"df = df[df['content'] != '']\n",
"# Drop pin notifications.\n",
"df = df[df['content'] != 'Pinned a message.']\n",
"# Reindex\n",
"df = df.reset_index(drop=True)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# mentions is a list of user objects; keep the first mentioned user's name,\n",
"# or None when nobody was mentioned.\n",
"df['mentions'] = df['mentions'].apply(lambda mentions: mentions[0]['name'] if mentions else None)\n",
"df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Process timestamp column in format YYYY-MM-DDTHH:MM:SS.MMM+00:00 into a pandas datetime object\n",
"df['timestamp'] = pd.to_datetime(df['timestamp'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Output all messages that mention \"Drew Linky\"\n",
"drew_linky_mentions_df = df[df['mentions'] == 'Drew Linky']\n",
"drew_linky_mentions_df.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Get the indices of the messages that mention \"Drew Linky\"\n",
"drew_linky_mentions_indices = drew_linky_mentions_df.index\n",
"drew_linky_mentions_indices"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"def get_drew_linky_consecutive_messages(df, index):\n",
"    \"\"\"Collect the burst of consecutive Drew Linky replies after a mention.\n",
"\n",
"    Starting at `index` (the row of the message that pinged Drew), scan\n",
"    forward gathering messages authored by 'Drew Linky'. Stops when a\n",
"    different author posts after Drew has started replying, or when a\n",
"    message is more than 2 minutes after the original one.\n",
"    Returns the list of Drew's message contents (possibly empty).\n",
"    \"\"\"\n",
"    messages = []\n",
"    original_message = df.iloc[index]\n",
"    start_time = original_message['timestamp']\n",
"    # Drew mode flips True once Drew's reply burst begins.\n",
"    drew_mode = False\n",
"    # FIX: was `index < (len(df) - 1)`, which never examined the final row.\n",
"    while index < len(df):\n",
"        message = df.iloc[index]\n",
"        index += 1\n",
"        # Give up once we are more than 2 minutes past the original message.\n",
"        if (message['timestamp'] - start_time).total_seconds() > 120:\n",
"            break\n",
"        # Content-inequality check skips the original mention message itself.\n",
"        if message['author'] == 'Drew Linky' and message['content'] != original_message['content'] and message['content'] != '':\n",
"            drew_mode = True\n",
"            messages.append(message['content'])\n",
"        elif drew_mode:\n",
"            # A different author interrupted the burst; stop.\n",
"            break\n",
"    return messages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"def get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices):\n",
"    \"\"\"Build fine-tuning examples from every message that mentions Drew Linky.\n",
"\n",
"    Returns a tuple (prompt_objects, chat_messages):\n",
"      * prompt_objects: [{'prompt': ..., 'completion': ...}] in the legacy\n",
"        OpenAI completion format ('###' prompt suffix, ' END' stop token).\n",
"      * chat_messages: flat [user, assistant, user, assistant, ...] dicts.\n",
"    \"\"\"\n",
"    prompt_objects = []\n",
"    chat_messages = []\n",
"    for mention_index in drew_linky_mentions_indices:\n",
"        prompt_text = df.iloc[mention_index]['content']\n",
"        replies = get_drew_linky_consecutive_messages(df, mention_index)\n",
"        if not replies:\n",
"            continue\n",
"        # Collapse the reply burst into one newline-separated completion.\n",
"        completion_text = '\\n'.join(replies)\n",
"        prompt_objects.append({\"prompt\": prompt_text + \"###\",\n",
"                               \"completion\": \" \" + completion_text + \" END\"})\n",
"        chat_messages.append({\"role\": \"user\", \"content\": prompt_text})\n",
"        chat_messages.append({\"role\": \"assistant\", \"content\": completion_text})\n",
"    return prompt_objects, chat_messages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Get the arrays\n",
"drew_linky_consecutive_messages_prompt_array, drew_linky_consecutive_messages_array = get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices)\n",
"# Print number of keys\n",
"print(f\"{len(drew_linky_consecutive_messages_array)} elements\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import os\n",
"import jsonlines\n",
"\n",
"# jsonlines mode='w' truncates any existing file, so the previous\n",
"# os.path.exists / os.remove dance was redundant and has been dropped.\n",
"with jsonlines.open('drew_linky_consecutive_messages.jsonl', mode='w') as writer:\n",
"    writer.write_all(drew_linky_consecutive_messages_array)\n",
"\n",
"with jsonlines.open('drew_linky_consecutive_messages_prompt.jsonl', mode='w') as writer:\n",
"    writer.write_all(drew_linky_consecutive_messages_prompt_array)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Open jsonl\n",
"with jsonlines.open('drew_linky_consecutive_messages.jsonl') as reader:\n",
" for obj in reader:\n",
" print(obj)"
]
},
{
"cell_type": "code",
"execution_count": 225,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# FINETUNING GPT\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Better to launch the commands in console, jupyter isn't that good at handling the output\n",
"# ! env OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.create -t drew_linky_consecutive_messages.jsonl -m davinci --suffix \"drew_linky\"\n",
"# ! OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.follow -i ft-modelname\n",
"# With davinci and 7285 finetuning elements, it took 7 minutes to tell me it would cost 27 bucks and enter the queue (be warned that with ada it just hung forever in the queue,\n",
"# I think openAI prioritizes big bucks), 35 minutes to start training, and 23 minutes per epoch out of 4 epochs, so well over one hour to train."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fill this in\n",
"question = \"what wand length should I pick between 9.5 and 14.5 inches?\"\n",
"\n",
"# NOTE(review): query_openai_model, text_to_prompt and pretty_print are not\n",
"# defined anywhere in this notebook -- they must be supplied before this\n",
"# cell can run.\n",
"answer = query_openai_model(text_to_prompt(question))\n",
"\n",
"pretty_print(question, answer)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment