pszemraj/convert-whatsapp-export-to-gpt2-script.ipynb

## convert-whatsapp-export-to-gpt2-script.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "convert whatsapp export to GPT2 script",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/pszemraj/a04056ff80ef4fe00b807d36286ad1d6/convert-whatsapp-export-to-gpt2-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T3gILJgPYbkK"
      },
      "source": [
        "# Converting Whatsapp Exports to GPT-2 Training \n",
        "\n",
        "- explores / illustrates how to convert a text file that is exported from whatsapp messages to the general `script` format used by `aitextgen` to train a GPT-2 chatbot model\n",
        "- note that all names, etc have been replaced in the example text file used here. \n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9z48V7uCZBZx"
      },
      "source": [
        "## formatting\n",
        "\n",
        "from IPython.display import HTML, display\n",
        "# colab formatting\n",
        "def set_css():\n",
        "    display(\n",
        "        HTML(\n",
        "            \"\"\"\n",
        "  <style>\n",
        "    pre {\n",
        "        white-space: pre-wrap;\n",
        "    }\n",
        "  </style>\n",
        "  \"\"\"\n",
        "        )\n",
        "    )\n",
        "\n",
        "get_ipython().events.register(\"pre_run_cell\", set_css)"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "cellView": "form",
        "id": "06iNiZ7fBn8f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "01fa6379-63de-4e48-b4f3-e55fbdceb2ad"
      },
      "source": [
        "dl_link = \"https://www.dropbox.com/s/loe823gu7pdra2i/_chat.txt?dl=1\" #@param {type:\"string\"}\n"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sRe_OcyXBTr2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "6698ad56-3011-49a9-b823-9ad3fac4a15e"
      },
      "source": [
        "import os \n",
        "\n",
        "vm_wd = os.getcwd()"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "id": "lIYdn1woOS1n",
        "outputId": "5674eadd-f64e-491d-f844-23f14159571d"
      },
      "source": [
        "from urllib import request\n",
        "from os.path import join\n",
        "\n",
        "local_name = join(vm_wd, \"test-text-file.txt\")\n",
        "request.urlretrieve(dl_link, local_name)\n"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('/content/test-text-file.txt', <http.client.HTTPMessage at 0x7f8459779fd0>)"
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5z0h2pCABuQ-",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "092c15ae-1f2a-4efa-bbb8-c94601e881a3"
      },
      "source": [
        "with open(local_name, 'r', encoding='utf-8', errors='ignore') as f:\n",
        "    textlines = f.readlines()\n",
        "\n",
        "print(len(textlines), type(textlines))"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "624 <class 'list'>\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Dh7xHtWfCShN"
      },
      "source": [
        "clean the text"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jrnvT0DDCTxl",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "a548777d-f8b0-4452-9106-531bd21beaa0"
      },
      "source": [
        "%%capture\n",
        "!pip install -U clean-text\n",
        "from cleantext import clean"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "07rS0sYdCaD8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "b2812ffb-7e3a-4474-d0b8-23ec320b7f57"
      },
      "source": [
        "textlines = [clean(line) for line in textlines]"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "B6lQIr5DB8kY",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 260
        },
        "outputId": "0e85b125-1221-4dfe-b1eb-d691450fd377"
      },
      "source": [
        "import pprint as pp\n",
        "\n",
        "pp.pprint(textlines[:10])\n"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['[25.02.21, 23:57:58] alexander das great: hi kids',\n",
            " '[26.02.21, 06:54:14] jorge: hello friends',\n",
            " '[26.02.21, 08:13:32] olga: hi',\n",
            " '[26.02.21, 14:18:31] alexander das great: image omitted',\n",
            " '[26.02.21, 14:18:33] alexander das great: ayyy',\n",
            " '[26.02.21, 14:18:51] alexander das great: when is the first thing due? maybe '\n",
            " 'i missed it but not announced right',\n",
            " \"[26.02.21, 14:23:47] jorge: task 0 opens monday. i don't think it's for \"\n",
            " 'marks tho. task 1a opens monday the week after',\n",
            " '[26.02.21, 14:23:55] jorge: looks like 2 weeks to get it done',\n",
            " '[26.02.21, 14:34:05] olga: i am following the recordings btw, not the live '\n",
            " 'session, because of double sheduling. i will be at the q&a sessions though',\n",
            " '[26.02.21, 14:34:45] jorge: also not attending tutorials due to double '\n",
            " \"scheduling. i'm sure it'll be fine\"]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u4BX2cSiEpDa",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 260
        },
        "outputId": "072cca15-1afa-4e15-c2a9-fb770797b1e7"
      },
      "source": [
        "import re\n",
        "\n",
        "re_string = \"\\[([0-9]+(\\.[0-9]+)+), ([0-9]+(:[0-9]+)+)\\] \"\n",
        "\n",
        "# re.sub(pattern, repl, string, count=0, flags=0)\n",
        "\n",
        "sub_textlines = [re.sub(re_string, \"\", line) for line in textlines]\n",
        "\n",
        "\n",
        "pp.pprint(sub_textlines[:10])\n"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['alexander das great: hi kids',\n",
            " 'jorge: hello friends',\n",
            " 'olga: hi',\n",
            " 'alexander das great: image omitted',\n",
            " 'alexander das great: ayyy',\n",
            " 'alexander das great: when is the first thing due? maybe i missed it but not '\n",
            " 'announced right',\n",
            " \"jorge: task 0 opens monday. i don't think it's for marks tho. task 1a opens \"\n",
            " 'monday the week after',\n",
            " 'jorge: looks like 2 weeks to get it done',\n",
            " 'olga: i am following the recordings btw, not the live session, because of '\n",
            " 'double sheduling. i will be at the q&a sessions though',\n",
            " \"jorge: also not attending tutorials due to double scheduling. i'm sure it'll \"\n",
            " 'be fine']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xNCTigvvGc0p",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 381
        },
        "outputId": "f4bbb101-34b4-4958-a0cd-6dfff55e6e67"
      },
      "source": [
        "fin_text = []\n",
        "\n",
        "for line in sub_textlines:\n",
        "    line = str(line)\n",
        "    parts = line.split(\": \")\n",
        "    if len(parts) == 2 and isinstance(parts, list):\n",
        "        fin_text.append(parts[0] + \":\\n\")\n",
        "        fin_text.append(parts[1] + \"\\n\")\n",
        "        fin_text.append(\"\\n\")\n",
        "    elif len(parts) > 2:\n",
        "        fin_text.append(parts[0] + \":\\n\")\n",
        "        fin_text.append(\" \".join(parts[1:]) + \"\\n\")\n",
        "        fin_text.append(\"\\n\")\n",
        "    else:\n",
        "        continue\n",
        "\n",
        "\n",
        "\n",
        "pp.pprint(fin_text[:20])"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['alexander das great:\\n',\n",
            " 'hi kids\\n',\n",
            " '\\n',\n",
            " 'jorge:\\n',\n",
            " 'hello friends\\n',\n",
            " '\\n',\n",
            " 'olga:\\n',\n",
            " 'hi\\n',\n",
            " '\\n',\n",
            " 'alexander das great:\\n',\n",
            " 'image omitted\\n',\n",
            " '\\n',\n",
            " 'alexander das great:\\n',\n",
            " 'ayyy\\n',\n",
            " '\\n',\n",
            " 'alexander das great:\\n',\n",
            " 'when is the first thing due? maybe i missed it but not announced right\\n',\n",
            " '\\n',\n",
            " 'jorge:\\n',\n",
            " \"task 0 opens monday. i don't think it's for marks tho. task 1a opens monday \"\n",
            " 'the week after\\n']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UiaxKXigIOmY"
      },
      "source": [
        "# save\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oZyYvlSJIPyN",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "outputId": "9f94efeb-602b-45dc-d7a9-4548ea65e0ca"
      },
      "source": [
        "outname = \"reformatted_text.txt\"\n",
        "with open(outname, 'w', encoding='utf-8', errors='ignore') as fo:\n",
        "\n",
        "    fo.writelines(fin_text)\n",
        "\n",
        "from google.colab import files\n",
        "\n",
        "files.download(outname)"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "application/javascript": [
              "download(\"download_aabbde17-4cb6-460f-a731-aba6dfa72687\", \"reformatted_text.txt\", 43875)"
            ],
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ]
          },
          "metadata": {}
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "convert whatsapp export to GPT2 script",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"display_name": "Python 3",
	"name": "python3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/pszemraj/a04056ff80ef4fe00b807d36286ad1d6/convert-whatsapp-export-to-gpt2-script.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "T3gILJgPYbkK"
	},
	"source": [
	"# Converting Whatsapp Exports to GPT-2 Training \n",
	"\n",
	"- explores / illustrates how to convert a text file that is exported from whatsapp messages to the general `script` format used by `aitextgen` to train a GPT-2 chatbot model\n",
	"- note that all names, etc have been replaced in the example text file used here. \n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "9z48V7uCZBZx"
	},
	"source": [
	"## formatting\n",
	"\n",
	"from IPython.display import HTML, display\n",
	"# colab formatting\n",
	"def set_css():\n",
	" display(\n",
	" HTML(\n",
	" \"\"\"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" \"\"\"\n",
	" )\n",
	" )\n",
	"\n",
	"get_ipython().events.register(\"pre_run_cell\", set_css)"
	],
	"execution_count": 1,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"cellView": "form",
	"id": "06iNiZ7fBn8f",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"outputId": "01fa6379-63de-4e48-b4f3-e55fbdceb2ad"
	},
	"source": [
	"dl_link = \"https://www.dropbox.com/s/loe823gu7pdra2i/_chat.txt?dl=1\" #@param {type:\"string\"}\n"
	],
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "sRe_OcyXBTr2",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"outputId": "6698ad56-3011-49a9-b823-9ad3fac4a15e"
	},
	"source": [
	"import os \n",
	"\n",
	"vm_wd = os.getcwd()"
	],
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"id": "lIYdn1woOS1n",
	"outputId": "5674eadd-f64e-491d-f844-23f14159571d"
	},
	"source": [
	"from urllib import request\n",
	"from os.path import join\n",
	"\n",
	"local_name = join(vm_wd, \"test-text-file.txt\")\n",
	"request.urlretrieve(dl_link, local_name)\n"
	],
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"('/content/test-text-file.txt', <http.client.HTTPMessage at 0x7f8459779fd0>)"
	]
	},
	"metadata": {},
	"execution_count": 4
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "5z0h2pCABuQ-",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "092c15ae-1f2a-4efa-bbb8-c94601e881a3"
	},
	"source": [
	"with open(local_name, 'r', encoding='utf-8', errors='ignore') as f:\n",
	" textlines = f.readlines()\n",
	"\n",
	"print(len(textlines), type(textlines))"
	],
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"624 <class 'list'>\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "Dh7xHtWfCShN"
	},
	"source": [
	"clean the text"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "jrnvT0DDCTxl",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"outputId": "a548777d-f8b0-4452-9106-531bd21beaa0"
	},
	"source": [
	"%%capture\n",
	"!pip install -U clean-text\n",
	"from cleantext import clean"
	],
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "07rS0sYdCaD8",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"outputId": "b2812ffb-7e3a-4474-d0b8-23ec320b7f57"
	},
	"source": [
	"textlines = [clean(line) for line in textlines]"
	],
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "B6lQIr5DB8kY",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 260
	},
	"outputId": "0e85b125-1221-4dfe-b1eb-d691450fd377"
	},
	"source": [
	"import pprint as pp\n",
	"\n",
	"pp.pprint(textlines[:10])\n"
	],
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"['[25.02.21, 23:57:58] alexander das great: hi kids',\n",
	" '[26.02.21, 06:54:14] jorge: hello friends',\n",
	" '[26.02.21, 08:13:32] olga: hi',\n",
	" '[26.02.21, 14:18:31] alexander das great: image omitted',\n",
	" '[26.02.21, 14:18:33] alexander das great: ayyy',\n",
	" '[26.02.21, 14:18:51] alexander das great: when is the first thing due? maybe '\n",
	" 'i missed it but not announced right',\n",
	" \"[26.02.21, 14:23:47] jorge: task 0 opens monday. i don't think it's for \"\n",
	" 'marks tho. task 1a opens monday the week after',\n",
	" '[26.02.21, 14:23:55] jorge: looks like 2 weeks to get it done',\n",
	" '[26.02.21, 14:34:05] olga: i am following the recordings btw, not the live '\n",
	" 'session, because of double sheduling. i will be at the q&a sessions though',\n",
	" '[26.02.21, 14:34:45] jorge: also not attending tutorials due to double '\n",
	" \"scheduling. i'm sure it'll be fine\"]\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "u4BX2cSiEpDa",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 260
	},
	"outputId": "072cca15-1afa-4e15-c2a9-fb770797b1e7"
	},
	"source": [
	"import re\n",
	"\n",
	"re_string = \"\\[([0-9]+(\\.[0-9]+)+), ([0-9]+(:[0-9]+)+)\\] \"\n",
	"\n",
	"# re.sub(pattern, repl, string, count=0, flags=0)\n",
	"\n",
	"sub_textlines = [re.sub(re_string, \"\", line) for line in textlines]\n",
	"\n",
	"\n",
	"pp.pprint(sub_textlines[:10])\n"
	],
	"execution_count": 9,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"['alexander das great: hi kids',\n",
	" 'jorge: hello friends',\n",
	" 'olga: hi',\n",
	" 'alexander das great: image omitted',\n",
	" 'alexander das great: ayyy',\n",
	" 'alexander das great: when is the first thing due? maybe i missed it but not '\n",
	" 'announced right',\n",
	" \"jorge: task 0 opens monday. i don't think it's for marks tho. task 1a opens \"\n",
	" 'monday the week after',\n",
	" 'jorge: looks like 2 weeks to get it done',\n",
	" 'olga: i am following the recordings btw, not the live session, because of '\n",
	" 'double sheduling. i will be at the q&a sessions though',\n",
	" \"jorge: also not attending tutorials due to double scheduling. i'm sure it'll \"\n",
	" 'be fine']\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "xNCTigvvGc0p",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 381
	},
	"outputId": "f4bbb101-34b4-4958-a0cd-6dfff55e6e67"
	},
	"source": [
	"fin_text = []\n",
	"\n",
	"for line in sub_textlines:\n",
	" line = str(line)\n",
	" parts = line.split(\": \")\n",
	" if len(parts) == 2 and isinstance(parts, list):\n",
	" fin_text.append(parts[0] + \":\\n\")\n",
	" fin_text.append(parts[1] + \"\\n\")\n",
	" fin_text.append(\"\\n\")\n",
	" elif len(parts) > 2:\n",
	" fin_text.append(parts[0] + \":\\n\")\n",
	" fin_text.append(\" \".join(parts[1:]) + \"\\n\")\n",
	" fin_text.append(\"\\n\")\n",
	" else:\n",
	" continue\n",
	"\n",
	"\n",
	"\n",
	"pp.pprint(fin_text[:20])"
	],
	"execution_count": 10,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"['alexander das great:\\n',\n",
	" 'hi kids\\n',\n",
	" '\\n',\n",
	" 'jorge:\\n',\n",
	" 'hello friends\\n',\n",
	" '\\n',\n",
	" 'olga:\\n',\n",
	" 'hi\\n',\n",
	" '\\n',\n",
	" 'alexander das great:\\n',\n",
	" 'image omitted\\n',\n",
	" '\\n',\n",
	" 'alexander das great:\\n',\n",
	" 'ayyy\\n',\n",
	" '\\n',\n",
	" 'alexander das great:\\n',\n",
	" 'when is the first thing due? maybe i missed it but not announced right\\n',\n",
	" '\\n',\n",
	" 'jorge:\\n',\n",
	" \"task 0 opens monday. i don't think it's for marks tho. task 1a opens monday \"\n",
	" 'the week after\\n']\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "UiaxKXigIOmY"
	},
	"source": [
	"# save\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "oZyYvlSJIPyN",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"outputId": "9f94efeb-602b-45dc-d7a9-4548ea65e0ca"
	},
	"source": [
	"outname = \"reformatted_text.txt\"\n",
	"with open(outname, 'w', encoding='utf-8', errors='ignore') as fo:\n",
	"\n",
	" fo.writelines(fin_text)\n",
	"\n",
	"from google.colab import files\n",
	"\n",
	"files.download(outname)"
	],
	"execution_count": 11,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/html": [
	"\n",
	" <style>\n",
	" pre {\n",
	" white-space: pre-wrap;\n",
	" }\n",
	" </style>\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "display_data",
	"data": {
	"application/javascript": [
	"\n",
	" async function download(id, filename, size) {\n",
	" if (!google.colab.kernel.accessAllowed) {\n",
	" return;\n",
	" }\n",
	" const div = document.createElement('div');\n",
	" const label = document.createElement('label');\n",
	" label.textContent = `Downloading \"${filename}\": `;\n",
	" div.appendChild(label);\n",
	" const progress = document.createElement('progress');\n",
	" progress.max = size;\n",
	" div.appendChild(progress);\n",
	" document.body.appendChild(div);\n",
	"\n",
	" const buffers = [];\n",
	" let downloaded = 0;\n",
	"\n",
	" const channel = await google.colab.kernel.comms.open(id);\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	"\n",
	" for await (const message of channel.messages) {\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	" if (message.buffers) {\n",
	" for (const buffer of message.buffers) {\n",
	" buffers.push(buffer);\n",
	" downloaded += buffer.byteLength;\n",
	" progress.value = downloaded;\n",
	" }\n",
	" }\n",
	" }\n",
	" const blob = new Blob(buffers, {type: 'application/binary'});\n",
	" const a = document.createElement('a');\n",
	" a.href = window.URL.createObjectURL(blob);\n",
	" a.download = filename;\n",
	" div.appendChild(a);\n",
	" a.click();\n",
	" div.remove();\n",
	" }\n",
	" "
	],
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "display_data",
	"data": {
	"application/javascript": [
	"download(\"download_aabbde17-4cb6-460f-a731-aba6dfa72687\", \"reformatted_text.txt\", 43875)"
	],
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	]
	},
	"metadata": {}
	}
	]
	}
	]
	}