-
-
Save recordcrash/1f2204c82e3142328508037cf74951ad to your computer and use it in GitHub Desktop.
Nachlass Notebook (Discord messages to OpenAI davinci finetuning)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# DATA PROCESSING\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"import os\n",
"\n",
"# Never hardcode API keys in a notebook (they end up in version control and\n",
"# shared gists). Read the key from the environment instead; set it with\n",
"#   export OPENAI_API_KEY=sk-...\n",
"OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\", \"\")"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Load the full Discord export into a dataframe.\n",
"# NOTE: the code reads 'messages_all.json' (~60 million lines); swap in the\n",
"# smaller 'messages5m.json' to iterate faster. (The old comment claimed the\n",
"# opposite of what the code did.)\n",
"with open('messages_all.json', encoding='UTF8') as f:\n",
"    data = json.load(f)\n",
"df = pd.DataFrame(data[\"messages\"])"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Quick look at the raw frame (head() shows 5 rows by default)\n",
"df.head()"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Flatten the author column: keep only the 'name' field of each author object\n",
"df['author'] = df['author'].apply(lambda author: author['name'])\n",
"df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Keep only the columns used downstream\n",
"df = df[['id', 'type', 'timestamp', 'content', 'author', 'mentions']]\n",
"# Strip the \"@Drew Linky\" ping (with or without a trailing space), plus the\n",
"# '###' / ' END' markers that would collide with the finetuning separators\n",
"for token in ('@Drew Linky ', '@Drew Linky', '###', ' END'):\n",
"    df['content'] = df['content'].apply(lambda text: text.replace(token, ''))\n",
"# Drop messages left empty after stripping, and bare pin notices\n",
"df = df[~df['content'].isin(['', 'Pinned a message.'])]\n",
"# Reindex so row labels line up with positional access later\n",
"df = df.reset_index(drop=True)\n",
"df"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# 'mentions' is a list of user objects; keep just the first mentioned user's\n",
"# name, or None when nobody was mentioned\n",
"df['mentions'] = df['mentions'].apply(lambda users: users[0]['name'] if users else None)\n",
"df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Parse ISO-8601 timestamps (YYYY-MM-DDTHH:MM:SS.mmm+00:00) into\n",
"# timezone-aware pandas datetimes so we can do arithmetic on them\n",
"df['timestamp'] = pd.to_datetime(df['timestamp'])"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Messages whose first mention is \"Drew Linky\" — these become the prompts\n",
"drew_linky_mentions_df = df.loc[df['mentions'] == 'Drew Linky']\n",
"drew_linky_mentions_df.sample(5)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Get the indices of the messages that mention \"Drew Linky\"\n", | |
"drew_linky_mentions_indices = drew_linky_mentions_df.index\n", | |
"drew_linky_mentions_indices" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Helper: starting from the row at `index` (a message that pings Drew), walk\n",
"# forward and collect Drew Linky's consecutive replies. Collection stops when\n",
"# another author interrupts after Drew has started, when more than 2 minutes\n",
"# have passed since the original message, or at the end of the dataframe.\n",
"def get_drew_linky_consecutive_messages(df, index):\n",
"    messages = []\n",
"    original_message = df.iloc[index]\n",
"    # Renamed from `datetime` so the local doesn't shadow the stdlib module\n",
"    start_time = original_message['timestamp']\n",
"    # Flips True once Drew's first reply has been seen\n",
"    drew_mode = False\n",
"    # Bound fixed to len(df): the old `len(df) - 1` bound, combined with the\n",
"    # post-access increment below, meant the last row was never inspected\n",
"    while index < len(df):\n",
"        message = df.iloc[index]\n",
"        index += 1\n",
"        # Stop once we are more than 2 minutes past the original message\n",
"        if (message['timestamp'] - start_time).total_seconds() > 120:\n",
"            break\n",
"        is_drew_reply = (message['author'] == 'Drew Linky'\n",
"                         and message['content'] != original_message['content']\n",
"                         and message['content'] != '')\n",
"        if is_drew_reply:\n",
"            drew_mode = True\n",
"            messages.append(message['content'])\n",
"        # A different author interrupted Drew's run; stop collecting\n",
"        elif drew_mode:\n",
"            break\n",
"    return messages"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Build finetuning records: for each message that pings Drew, pair the ping\n",
"# with Drew's consecutive replies (collapsed into a single string).\n",
"# Returns (completions-format records, chat-format records).\n",
"def get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices):\n",
"    prompt_records = []\n",
"    chat_records = []\n",
"    for index in drew_linky_mentions_indices:\n",
"        prompt_text = df.iloc[index]['content']\n",
"        replies = get_drew_linky_consecutive_messages(df, index)\n",
"        if not replies:\n",
"            continue\n",
"        completion_text = '\\n'.join(replies)\n",
"        # Classic completions format with OpenAI's suggested separators\n",
"        prompt_records.append({\"prompt\": prompt_text + \"###\",\n",
"                               \"completion\": \" \" + completion_text + \" END\"})\n",
"        # Chat format (user/assistant turns) for chat-style finetuning\n",
"        chat_records.extend([{\"role\": \"user\", \"content\": prompt_text},\n",
"                             {\"role\": \"assistant\", \"content\": completion_text}])\n",
"    return prompt_records, chat_records"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"# Build both training sets and report how many chat turns were produced\n",
"drew_linky_consecutive_messages_prompt_array, drew_linky_consecutive_messages_array = \\\n",
"    get_drew_linky_consecutive_messages_array(df, drew_linky_mentions_indices)\n",
"print(f\"{len(drew_linky_consecutive_messages_array)} elements\")"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"pycharm": { | |
"is_executing": true | |
} | |
}, | |
"outputs": [], | |
"source": [
"import os\n",
"\n",
"import jsonlines\n",
"\n",
"# Write both training files, removing any stale copies first\n",
"outputs = [\n",
"    ('drew_linky_consecutive_messages.jsonl', drew_linky_consecutive_messages_array),\n",
"    ('drew_linky_consecutive_messages_prompt.jsonl', drew_linky_consecutive_messages_prompt_array),\n",
"]\n",
"for path, records in outputs:\n",
"    if os.path.exists(path):\n",
"        os.remove(path)\n",
"    with jsonlines.open(path, mode='w') as writer:\n",
"        writer.write_all(records)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [
"# Sanity check: stream the chat-format jsonl back and print each record\n",
"with jsonlines.open('drew_linky_consecutive_messages.jsonl') as reader:\n",
"    for record in reader:\n",
"        print(record)"
]
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 225, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# FINETUNING GPT\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 243, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Better to launch the commands in console, jupyter isn't that good at handling the output\n", | |
"# ! env OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.create -t drew_linky_consecutive_messages.jsonl -m davinci --suffix \"drew_linky\"\n", | |
"# ! OPENAI_API_KEY=\"sk-...\" openai api fine_tunes.follow -i ft-modelname\n", | |
"# With davinci and 7285 finetuning elements, it took 7 minutes to tell me it would cost 27 bucks and enter the queue (be warned that with ada it just hung forever in the queue,\n", | |
"# I think openAI prioritizes big bucks), 35 minutes to start training, and 23 minutes per epoch out of 4 epochs, so well over one hour to train." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [
"# Fill this in\n",
"# FIXME: query_openai_model, text_to_prompt and pretty_print are not defined\n",
"# anywhere in this notebook — define (or import) them before running, otherwise\n",
"# this cell raises NameError and the notebook fails Restart-&-Run-All.\n",
"question = \"what wand length should I pick between 9.5 and 14.5 inches?\"\n",
"\n",
"answer = query_openai_model(text_to_prompt(question))\n",
"\n",
"pretty_print(question, answer)"
]
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment