Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created November 22, 2021 06:47
Show Gist options
  • Save pszemraj/a04056ff80ef4fe00b807d36286ad1d6 to your computer and use it in GitHub Desktop.
Save pszemraj/a04056ff80ef4fe00b807d36286ad1d6 to your computer and use it in GitHub Desktop.
convert whatsapp export to GPT2 script
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "convert whatsapp export to GPT2 script",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/pszemraj/a04056ff80ef4fe00b807d36286ad1d6/convert-whatsapp-export-to-gpt2-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T3gILJgPYbkK"
},
"source": [
"# Converting Whatsapp Exports to GPT-2 Training \n",
"\n",
"- explores / illustrates how to convert a text file that is exported from whatsapp messages to the general `script` format used by `aitextgen` to train a GPT-2 chatbot model\n",
"- note that all names, etc have been replaced in the example text file used here. \n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "9z48V7uCZBZx"
},
"source": [
"## formatting\n",
"\n",
"from IPython.display import HTML, display\n",
"# colab formatting\n",
"def set_css():\n",
" display(\n",
" HTML(\n",
" \"\"\"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" \"\"\"\n",
" )\n",
" )\n",
"\n",
"get_ipython().events.register(\"pre_run_cell\", set_css)"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"cellView": "form",
"id": "06iNiZ7fBn8f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "01fa6379-63de-4e48-b4f3-e55fbdceb2ad"
},
"source": [
"dl_link = \"https://www.dropbox.com/s/loe823gu7pdra2i/_chat.txt?dl=1\" #@param {type:\"string\"}\n"
],
"execution_count": 2,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "sRe_OcyXBTr2",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "6698ad56-3011-49a9-b823-9ad3fac4a15e"
},
"source": [
"import os \n",
"\n",
"vm_wd = os.getcwd()"
],
"execution_count": 3,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"id": "lIYdn1woOS1n",
"outputId": "5674eadd-f64e-491d-f844-23f14159571d"
},
"source": [
"from urllib import request\n",
"from os.path import join\n",
"\n",
"local_name = join(vm_wd, \"test-text-file.txt\")\n",
"request.urlretrieve(dl_link, local_name)\n"
],
"execution_count": 4,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"('/content/test-text-file.txt', <http.client.HTTPMessage at 0x7f8459779fd0>)"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "5z0h2pCABuQ-",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "092c15ae-1f2a-4efa-bbb8-c94601e881a3"
},
"source": [
"with open(local_name, 'r', encoding='utf-8', errors='ignore') as f:\n",
" textlines = f.readlines()\n",
"\n",
"print(len(textlines), type(textlines))"
],
"execution_count": 5,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"624 <class 'list'>\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dh7xHtWfCShN"
},
"source": [
"clean the text"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jrnvT0DDCTxl",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "a548777d-f8b0-4452-9106-531bd21beaa0"
},
"source": [
"%%capture\n",
"!pip install -U clean-text\n",
"from cleantext import clean"
],
"execution_count": 6,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "07rS0sYdCaD8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "b2812ffb-7e3a-4474-d0b8-23ec320b7f57"
},
"source": [
"textlines = [clean(line) for line in textlines]"
],
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "B6lQIr5DB8kY",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 260
},
"outputId": "0e85b125-1221-4dfe-b1eb-d691450fd377"
},
"source": [
"import pprint as pp\n",
"\n",
"pp.pprint(textlines[:10])\n"
],
"execution_count": 8,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"['[25.02.21, 23:57:58] alexander das great: hi kids',\n",
" '[26.02.21, 06:54:14] jorge: hello friends',\n",
" '[26.02.21, 08:13:32] olga: hi',\n",
" '[26.02.21, 14:18:31] alexander das great: image omitted',\n",
" '[26.02.21, 14:18:33] alexander das great: ayyy',\n",
" '[26.02.21, 14:18:51] alexander das great: when is the first thing due? maybe '\n",
" 'i missed it but not announced right',\n",
" \"[26.02.21, 14:23:47] jorge: task 0 opens monday. i don't think it's for \"\n",
" 'marks tho. task 1a opens monday the week after',\n",
" '[26.02.21, 14:23:55] jorge: looks like 2 weeks to get it done',\n",
" '[26.02.21, 14:34:05] olga: i am following the recordings btw, not the live '\n",
" 'session, because of double sheduling. i will be at the q&a sessions though',\n",
" '[26.02.21, 14:34:45] jorge: also not attending tutorials due to double '\n",
" \"scheduling. i'm sure it'll be fine\"]\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "u4BX2cSiEpDa",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 260
},
"outputId": "072cca15-1afa-4e15-c2a9-fb770797b1e7"
},
"source": [
"import re\n",
"\n",
"re_string = \"\\[([0-9]+(\\.[0-9]+)+), ([0-9]+(:[0-9]+)+)\\] \"\n",
"\n",
"# re.sub(pattern, repl, string, count=0, flags=0)\n",
"\n",
"sub_textlines = [re.sub(re_string, \"\", line) for line in textlines]\n",
"\n",
"\n",
"pp.pprint(sub_textlines[:10])\n"
],
"execution_count": 9,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"['alexander das great: hi kids',\n",
" 'jorge: hello friends',\n",
" 'olga: hi',\n",
" 'alexander das great: image omitted',\n",
" 'alexander das great: ayyy',\n",
" 'alexander das great: when is the first thing due? maybe i missed it but not '\n",
" 'announced right',\n",
" \"jorge: task 0 opens monday. i don't think it's for marks tho. task 1a opens \"\n",
" 'monday the week after',\n",
" 'jorge: looks like 2 weeks to get it done',\n",
" 'olga: i am following the recordings btw, not the live session, because of '\n",
" 'double sheduling. i will be at the q&a sessions though',\n",
" \"jorge: also not attending tutorials due to double scheduling. i'm sure it'll \"\n",
" 'be fine']\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "xNCTigvvGc0p",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 381
},
"outputId": "f4bbb101-34b4-4958-a0cd-6dfff55e6e67"
},
"source": [
"fin_text = []\n",
"\n",
"for line in sub_textlines:\n",
" line = str(line)\n",
" parts = line.split(\": \")\n",
" if len(parts) == 2 and isinstance(parts, list):\n",
" fin_text.append(parts[0] + \":\\n\")\n",
" fin_text.append(parts[1] + \"\\n\")\n",
" fin_text.append(\"\\n\")\n",
" elif len(parts) > 2:\n",
" fin_text.append(parts[0] + \":\\n\")\n",
" fin_text.append(\" \".join(parts[1:]) + \"\\n\")\n",
" fin_text.append(\"\\n\")\n",
" else:\n",
" continue\n",
"\n",
"\n",
"\n",
"pp.pprint(fin_text[:20])"
],
"execution_count": 10,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"['alexander das great:\\n',\n",
" 'hi kids\\n',\n",
" '\\n',\n",
" 'jorge:\\n',\n",
" 'hello friends\\n',\n",
" '\\n',\n",
" 'olga:\\n',\n",
" 'hi\\n',\n",
" '\\n',\n",
" 'alexander das great:\\n',\n",
" 'image omitted\\n',\n",
" '\\n',\n",
" 'alexander das great:\\n',\n",
" 'ayyy\\n',\n",
" '\\n',\n",
" 'alexander das great:\\n',\n",
" 'when is the first thing due? maybe i missed it but not announced right\\n',\n",
" '\\n',\n",
" 'jorge:\\n',\n",
" \"task 0 opens monday. i don't think it's for marks tho. task 1a opens monday \"\n",
" 'the week after\\n']\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UiaxKXigIOmY"
},
"source": [
"# save\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oZyYvlSJIPyN",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "9f94efeb-602b-45dc-d7a9-4548ea65e0ca"
},
"source": [
"outname = \"reformatted_text.txt\"\n",
"with open(outname, 'w', encoding='utf-8', errors='ignore') as fo:\n",
"\n",
" fo.writelines(fin_text)\n",
"\n",
"from google.colab import files\n",
"\n",
"files.download(outname)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" <style>\n",
" pre {\n",
" white-space: pre-wrap;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/javascript": [
"\n",
" async function download(id, filename, size) {\n",
" if (!google.colab.kernel.accessAllowed) {\n",
" return;\n",
" }\n",
" const div = document.createElement('div');\n",
" const label = document.createElement('label');\n",
" label.textContent = `Downloading \"${filename}\": `;\n",
" div.appendChild(label);\n",
" const progress = document.createElement('progress');\n",
" progress.max = size;\n",
" div.appendChild(progress);\n",
" document.body.appendChild(div);\n",
"\n",
" const buffers = [];\n",
" let downloaded = 0;\n",
"\n",
" const channel = await google.colab.kernel.comms.open(id);\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
"\n",
" for await (const message of channel.messages) {\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
" if (message.buffers) {\n",
" for (const buffer of message.buffers) {\n",
" buffers.push(buffer);\n",
" downloaded += buffer.byteLength;\n",
" progress.value = downloaded;\n",
" }\n",
" }\n",
" }\n",
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
" const a = document.createElement('a');\n",
" a.href = window.URL.createObjectURL(blob);\n",
" a.download = filename;\n",
" div.appendChild(a);\n",
" a.click();\n",
" div.remove();\n",
" }\n",
" "
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/javascript": [
"download(\"download_aabbde17-4cb6-460f-a731-aba6dfa72687\", \"reformatted_text.txt\", 43875)"
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {}
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment