-
-
Save recordcrash/1f2204c82e3142328508037cf74951ad to your computer and use it in GitHub Desktop.
Nachlass Notebook (Discord messages to OpenAI davinci finetuning)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# DATA PROCESSING\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"import os\n",
"\n",
"# Never hardcode API keys in a notebook (they end up in version control and\n",
"# shared gists). Read the key from the environment instead; set it with\n",
"#   export OPENAI_API_KEY=sk-...\n",
"OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\", \"\")"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Load the full Discord export into a dataframe.\n",
"# NOTE: the code reads 'messages_all.json' (~60 million lines); swap in the\n",
"# smaller 'messages5m.json' to iterate faster. (The old comment claimed the\n",
"# opposite of what the code did.)\n",
"with open('messages_all.json', encoding='UTF8') as f:\n",
"    data = json.load(f)\n",
"df = pd.DataFrame(data[\"messages\"])"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Quick look at the raw frame (head() shows 5 rows by default)\n",
"df.head()"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Flatten the author column: keep only the 'name' field of each author object\n",
"df['author'] = df['author'].apply(lambda author: author['name'])\n",
"df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Keep only the columns used downstream\n",
"df = df[['id', 'type', 'timestamp', 'content', 'author', 'mentions']]\n",
"# Strip the \"@Drew Linky\" ping (with or without a trailing space), plus the\n",
"# '###' / ' END' markers that would collide with the finetuning separators\n",
"for token in ('@Drew Linky ', '@Drew Linky', '###', ' END'):\n",
"    df['content'] = df['content'].apply(lambda text: text.replace(token, ''))\n",
"# Drop messages left empty after stripping, and bare pin notices\n",
"df = df[~df['content'].isin(['', 'Pinned a message.'])]\n",
"# Reindex so row labels line up with positional access later\n",
"df = df.reset_index(drop=True)\n",
"df"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# 'mentions' is a list of user objects; keep just the first mentioned user's\n",
"# name, or None when nobody was mentioned\n",
"df['mentions'] = df['mentions'].apply(lambda users: users[0]['name'] if users else None)\n",
"df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Parse ISO-8601 timestamps (YYYY-MM-DDTHH:MM:SS.mmm+00:00) into\n",
"# timezone-aware pandas datetimes so we can do arithmetic on them\n",
"df['timestamp'] = pd.to_datetime(df['timestamp'])"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Messages whose first mention is \"Drew Linky\" — these become the prompts\n",
"drew_linky_mentions_df = df.loc[df['mentions'] == 'Drew Linky']\n",
"drew_linky_mentions_df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Get the indices of the messages that mention \"Drew Linky\"\n", | |
"drew_linky_mentions_indices = drew_linky_mentions_df.index\n", | |
"drew_linky_mentions_indices" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Helper: starting from the row at `index` (a message that pings Drew), walk\n",
"# forward and collect Drew Linky's consecutive replies. Collection stops when\n",
"# another author interrupts after Drew has started, when more than 2 minutes\n",
"# have passed since the original message, or at the end of the dataframe.\n",
"def get_drew_linky_consecutive_messages(df, index):\n",
"    messages = []\n",
"    original_message = df.iloc[index]\n",
"    # Renamed from `datetime` so the local doesn't shadow the stdlib module\n",
"    start_time = original_message['timestamp']\n",
"    # Flips True once Drew's first reply has been seen\n",
"    drew_mode = False\n",
"    # Bound fixed to len(df): the old `len(df) - 1` bound, combined with the\n",
"    # post-access increment below, meant the last row was never inspected\n",
"    while index < len(df):\n",
"        message = df.iloc[index]\n",
"        index += 1\n",
"        # Stop once we are more than 2 minutes past the original message\n",
"        if (message['timestamp'] - start_time).total_seconds() > 120:\n",
"            break\n",
"        is_drew_reply = (message['author'] == 'Drew Linky'\n",
"                         and message['content'] != original_message['content']\n",
"                         and message['content'] != '')\n",
"        if is_drew_reply:\n",
"            drew_mode = True\n",
"            messages.append(message['content'])\n",
"        # A different author interrupted Drew's run; stop collecting\n",
"        elif drew_mode:\n",
"            break\n",
"    return messages"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Build finetuning records: for each message that pings Drew, pair the ping\n",
"# with Drew's consecutive replies (collapsed into a single string).\n",
"# Returns (completions-format records, chat-format records).\n",
"def get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices):\n",
"    prompt_records = []\n",
"    chat_records = []\n",
"    for index in drew_linky_mentions_indices:\n",
"        prompt_text = df.iloc[index]['content']\n",
"        replies = get_drew_linky_consecutive_messages(df, index)\n",
"        if not replies:\n",
"            continue\n",
"        completion_text = '\\n'.join(replies)\n",
"        # Classic completions format with OpenAI's suggested separators\n",
"        prompt_records.append({\"prompt\": prompt_text + \"###\",\n",
"                               \"completion\": \" \" + completion_text + \" END\"})\n",
"        # Chat format (user/assistant turns) for chat-style finetuning\n",
"        chat_records.extend([{\"role\": \"user\", \"content\": prompt_text},\n",
"                             {\"role\": \"assistant\", \"content\": completion_text}])\n",
"    return prompt_records, chat_records"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Build both training sets and report how many chat turns were produced\n",
"drew_linky_consecutive_messages_prompt_array, drew_linky_consecutive_messages_array = \\\n",
"    get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices)\n",
"print(f\"{len(drew_linky_consecutive_messages_array)} elements\")"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"import os\n",
"\n",
"import jsonlines\n",
"\n",
"# Write both training files, removing any stale copies first\n",
"outputs = [\n",
"    ('drew_linky_consecutive_messages.jsonl', drew_linky_consecutive_messages_array),\n",
"    ('drew_linky_consecutive_messages_prompt.jsonl', drew_linky_consecutive_messages_prompt_array),\n",
"]\n",
"for path, records in outputs:\n",
"    if os.path.exists(path):\n",
"        os.remove(path)\n",
"    with jsonlines.open(path, mode='w') as writer:\n",
"        writer.write_all(records)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [
"# Sanity check: stream the chat-format jsonl back and print each record\n",
"with jsonlines.open('drew_linky_consecutive_messages.jsonl') as reader:\n",
"    for record in reader:\n",
"        print(record)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 225, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# FINETUNING GPT\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 243, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Better to launch the commands in console, jupyter isn't that good at handling the output\n", | |
"# ! env OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.create -t drew_linky_consecutive_messages.jsonl -m davinci --suffix \"drew_linky\"\n", | |
"# ! OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.follow -i ft-modelname\n", | |
"# With davinci and 7285 finetuning elements, it took 7 minutes to tell me it would cost 27 bucks and enter the queue (be warned that with ada it just hung forever in the queue,\n", | |
"# I think openAI prioritizes big bucks), 35 minutes to start training, and 23 minutes per epoch out of 4 epochs, so well over one hour to train." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [
"# Fill this in\n",
"# FIXME: query_openai_model, text_to_prompt and pretty_print are not defined\n",
"# anywhere in this notebook — define (or import) them before running, otherwise\n",
"# this cell raises NameError and the notebook fails Restart-&-Run-All.\n",
"question = \"what wand length should I pick between 9.5 and 14.5 inches?\"\n",
"\n",
"answer = query_openai_model(text_to_prompt(question))\n",
"\n",
"pretty_print(question, answer)"
]
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment