Created
November 22, 2021 06:47
-
-
Save pszemraj/a04056ff80ef4fe00b807d36286ad1d6 to your computer and use it in GitHub Desktop.
convert whatsapp export to GPT2 script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "convert whatsapp export to GPT2 script", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/pszemraj/a04056ff80ef4fe00b807d36286ad1d6/convert-whatsapp-export-to-gpt2-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "T3gILJgPYbkK" | |
}, | |
"source": [ | |
"# Converting Whatsapp Exports to GPT-2 Training \n", | |
"\n", | |
"- explores / illustrates how to convert a text file that is exported from whatsapp messages to the general `script` format used by `aitextgen` to train a GPT-2 chatbot model\n", | |
"- note that all names, etc have been replaced in the example text file used here. \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9z48V7uCZBZx" | |
}, | |
"source": [ | |
"## formatting\n", | |
"\n", | |
"from IPython.display import HTML, display\n", | |
"# colab formatting\n", | |
"def set_css():\n", | |
" display(\n", | |
" HTML(\n", | |
" \"\"\"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" \"\"\"\n", | |
" )\n", | |
" )\n", | |
"\n", | |
"get_ipython().events.register(\"pre_run_cell\", set_css)" | |
], | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"cellView": "form", | |
"id": "06iNiZ7fBn8f", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "01fa6379-63de-4e48-b4f3-e55fbdceb2ad" | |
}, | |
"source": [ | |
"dl_link = \"https://www.dropbox.com/s/loe823gu7pdra2i/_chat.txt?dl=1\" #@param {type:\"string\"}\n" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "sRe_OcyXBTr2", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "6698ad56-3011-49a9-b823-9ad3fac4a15e" | |
}, | |
"source": [ | |
"import os \n", | |
"\n", | |
"vm_wd = os.getcwd()" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"id": "lIYdn1woOS1n", | |
"outputId": "5674eadd-f64e-491d-f844-23f14159571d" | |
}, | |
"source": [ | |
"from urllib import request\n", | |
"from os.path import join\n", | |
"\n", | |
"local_name = join(vm_wd, \"test-text-file.txt\")\n", | |
"request.urlretrieve(dl_link, local_name)\n" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"('/content/test-text-file.txt', <http.client.HTTPMessage at 0x7f8459779fd0>)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "5z0h2pCABuQ-", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "092c15ae-1f2a-4efa-bbb8-c94601e881a3" | |
}, | |
"source": [ | |
"with open(local_name, 'r', encoding='utf-8', errors='ignore') as f:\n", | |
" textlines = f.readlines()\n", | |
"\n", | |
"print(len(textlines), type(textlines))" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"624 <class 'list'>\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Dh7xHtWfCShN" | |
}, | |
"source": [ | |
"clean the text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jrnvT0DDCTxl", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "a548777d-f8b0-4452-9106-531bd21beaa0" | |
}, | |
"source": [ | |
"%%capture\n", | |
"!pip install -U clean-text\n", | |
"from cleantext import clean" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "07rS0sYdCaD8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "b2812ffb-7e3a-4474-d0b8-23ec320b7f57" | |
}, | |
"source": [ | |
"textlines = [clean(line) for line in textlines]" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "B6lQIr5DB8kY", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 260 | |
}, | |
"outputId": "0e85b125-1221-4dfe-b1eb-d691450fd377" | |
}, | |
"source": [ | |
"import pprint as pp\n", | |
"\n", | |
"pp.pprint(textlines[:10])\n" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"['[25.02.21, 23:57:58] alexander das great: hi kids',\n", | |
" '[26.02.21, 06:54:14] jorge: hello friends',\n", | |
" '[26.02.21, 08:13:32] olga: hi',\n", | |
" '[26.02.21, 14:18:31] alexander das great: image omitted',\n", | |
" '[26.02.21, 14:18:33] alexander das great: ayyy',\n", | |
" '[26.02.21, 14:18:51] alexander das great: when is the first thing due? maybe '\n", | |
" 'i missed it but not announced right',\n", | |
" \"[26.02.21, 14:23:47] jorge: task 0 opens monday. i don't think it's for \"\n", | |
" 'marks tho. task 1a opens monday the week after',\n", | |
" '[26.02.21, 14:23:55] jorge: looks like 2 weeks to get it done',\n", | |
" '[26.02.21, 14:34:05] olga: i am following the recordings btw, not the live '\n", | |
" 'session, because of double sheduling. i will be at the q&a sessions though',\n", | |
" '[26.02.21, 14:34:45] jorge: also not attending tutorials due to double '\n", | |
" \"scheduling. i'm sure it'll be fine\"]\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "u4BX2cSiEpDa", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 260 | |
}, | |
"outputId": "072cca15-1afa-4e15-c2a9-fb770797b1e7" | |
}, | |
"source": [ | |
"import re\n", | |
"\n", | |
"re_string = \"\\[([0-9]+(\\.[0-9]+)+), ([0-9]+(:[0-9]+)+)\\] \"\n", | |
"\n", | |
"# re.sub(pattern, repl, string, count=0, flags=0)\n", | |
"\n", | |
"sub_textlines = [re.sub(re_string, \"\", line) for line in textlines]\n", | |
"\n", | |
"\n", | |
"pp.pprint(sub_textlines[:10])\n" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"['alexander das great: hi kids',\n", | |
" 'jorge: hello friends',\n", | |
" 'olga: hi',\n", | |
" 'alexander das great: image omitted',\n", | |
" 'alexander das great: ayyy',\n", | |
" 'alexander das great: when is the first thing due? maybe i missed it but not '\n", | |
" 'announced right',\n", | |
" \"jorge: task 0 opens monday. i don't think it's for marks tho. task 1a opens \"\n", | |
" 'monday the week after',\n", | |
" 'jorge: looks like 2 weeks to get it done',\n", | |
" 'olga: i am following the recordings btw, not the live session, because of '\n", | |
" 'double sheduling. i will be at the q&a sessions though',\n", | |
" \"jorge: also not attending tutorials due to double scheduling. i'm sure it'll \"\n", | |
" 'be fine']\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xNCTigvvGc0p", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 381 | |
}, | |
"outputId": "f4bbb101-34b4-4958-a0cd-6dfff55e6e67" | |
}, | |
"source": [ | |
"fin_text = []\n", | |
"\n", | |
"for line in sub_textlines:\n", | |
" line = str(line)\n", | |
" parts = line.split(\": \")\n", | |
" if len(parts) == 2 and isinstance(parts, list):\n", | |
" fin_text.append(parts[0] + \":\\n\")\n", | |
" fin_text.append(parts[1] + \"\\n\")\n", | |
" fin_text.append(\"\\n\")\n", | |
" elif len(parts) > 2:\n", | |
" fin_text.append(parts[0] + \":\\n\")\n", | |
" fin_text.append(\" \".join(parts[1:]) + \"\\n\")\n", | |
" fin_text.append(\"\\n\")\n", | |
" else:\n", | |
" continue\n", | |
"\n", | |
"\n", | |
"\n", | |
"pp.pprint(fin_text[:20])" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"['alexander das great:\\n',\n", | |
" 'hi kids\\n',\n", | |
" '\\n',\n", | |
" 'jorge:\\n',\n", | |
" 'hello friends\\n',\n", | |
" '\\n',\n", | |
" 'olga:\\n',\n", | |
" 'hi\\n',\n", | |
" '\\n',\n", | |
" 'alexander das great:\\n',\n", | |
" 'image omitted\\n',\n", | |
" '\\n',\n", | |
" 'alexander das great:\\n',\n", | |
" 'ayyy\\n',\n", | |
" '\\n',\n", | |
" 'alexander das great:\\n',\n", | |
" 'when is the first thing due? maybe i missed it but not announced right\\n',\n", | |
" '\\n',\n", | |
" 'jorge:\\n',\n", | |
" \"task 0 opens monday. i don't think it's for marks tho. task 1a opens monday \"\n", | |
" 'the week after\\n']\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "UiaxKXigIOmY" | |
}, | |
"source": [ | |
"# save\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oZyYvlSJIPyN", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "9f94efeb-602b-45dc-d7a9-4548ea65e0ca" | |
}, | |
"source": [ | |
"outname = \"reformatted_text.txt\"\n", | |
"with open(outname, 'w', encoding='utf-8', errors='ignore') as fo:\n", | |
"\n", | |
" fo.writelines(fin_text)\n", | |
"\n", | |
"from google.colab import files\n", | |
"\n", | |
"files.download(outname)" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <style>\n", | |
" pre {\n", | |
" white-space: pre-wrap;\n", | |
" }\n", | |
" </style>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" async function download(id, filename, size) {\n", | |
" if (!google.colab.kernel.accessAllowed) {\n", | |
" return;\n", | |
" }\n", | |
" const div = document.createElement('div');\n", | |
" const label = document.createElement('label');\n", | |
" label.textContent = `Downloading \"${filename}\": `;\n", | |
" div.appendChild(label);\n", | |
" const progress = document.createElement('progress');\n", | |
" progress.max = size;\n", | |
" div.appendChild(progress);\n", | |
" document.body.appendChild(div);\n", | |
"\n", | |
" const buffers = [];\n", | |
" let downloaded = 0;\n", | |
"\n", | |
" const channel = await google.colab.kernel.comms.open(id);\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
"\n", | |
" for await (const message of channel.messages) {\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
" if (message.buffers) {\n", | |
" for (const buffer of message.buffers) {\n", | |
" buffers.push(buffer);\n", | |
" downloaded += buffer.byteLength;\n", | |
" progress.value = downloaded;\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
" const a = document.createElement('a');\n", | |
" a.href = window.URL.createObjectURL(blob);\n", | |
" a.download = filename;\n", | |
" div.appendChild(a);\n", | |
" a.click();\n", | |
" div.remove();\n", | |
" }\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"download(\"download_aabbde17-4cb6-460f-a731-aba6dfa72687\", \"reformatted_text.txt\", 43875)" | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment