ZhiyaoShu/tweet_sentiment_extraction_nlp.ipynb

## tweet_sentiment_extraction_nlp.ipynb
{
  "metadata": {
    "kernelspec": {
      "language": "python",
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.12",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kaggle": {
      "accelerator": "none",
      "dataSources": [
        {
          "sourceId": 16295,
          "databundleVersionId": 1099992,
          "sourceType": "competition"
        }
      ],
      "dockerImageVersionId": 30615,
      "isInternetEnabled": true,
      "language": "python",
      "sourceType": "notebook",
      "isGpuEnabled": false
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ZhiyaoShu/bba0a711f2f4b8f368c453dc9ae07641/tweet_sentiment_extraction_nlp.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q kaggle"
      ],
      "metadata": {
        "id": "tEsX1rtj0svh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/"
      ],
      "metadata": {
        "id": "IEX4Y7n81pbZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "! chmod 600 ~/.kaggle/kaggle.json"
      ],
      "metadata": {
        "id": "qeVgGSVS3iDU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "! kaggle datasets list"
      ],
      "metadata": {
        "id": "v_K6hdnu3jwB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "import os\n",
        "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
        "    for filename in filenames:\n",
        "        print(os.path.join(dirname, filename))\n"
      ],
      "metadata": {
        "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
        "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
        "execution": {
          "iopub.status.busy": "2023-12-09T03:03:55.911081Z",
          "iopub.execute_input": "2023-12-09T03:03:55.912258Z",
          "iopub.status.idle": "2023-12-09T03:03:56.443447Z",
          "shell.execute_reply.started": "2023-12-09T03:03:55.912211Z",
          "shell.execute_reply": "2023-12-09T03:03:56.441627Z"
        },
        "trusted": true,
        "id": "BCLtIvzkmh6U"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dMnUsKzuHHgq",
        "outputId": "9b1f2c09-f65d-4f84-b802-0b4938a88ef4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Intro: This is a practice of social media sentiment extraction"
      ],
      "metadata": {
        "id": "dukkVz6omh6X"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import tensorflow as tf\n",
        "print(tf.__version__)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:05:31.877064Z",
          "iopub.execute_input": "2023-12-09T03:05:31.877820Z",
          "iopub.status.idle": "2023-12-09T03:05:51.675825Z",
          "shell.execute_reply.started": "2023-12-09T03:05:31.877784Z",
          "shell.execute_reply": "2023-12-09T03:05:51.674840Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wmxvLE-ymh6Y",
        "outputId": "08cc362d-d7eb-466c-d359-982e77055a2f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "2.15.0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "! kaggle competitions download -c tweet-sentiment-extraction"
      ],
      "metadata": {
        "id": "gPNHxpMH3rXS",
        "outputId": "23771922-368c-4770-fb45-c3149f4af9fd",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Downloading tweet-sentiment-extraction.zip to /content\n",
            "\r  0% 0.00/1.39M [00:00<?, ?B/s]\n",
            "\r100% 1.39M/1.39M [00:00<00:00, 133MB/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "! unzip tweet-sentiment-extraction.zip"
      ],
      "metadata": {
        "id": "ENI-yrZi4CUt",
        "outputId": "d0c64924-1e65-4749-e858-e898fa75ec7b",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Archive:  tweet-sentiment-extraction.zip\n",
            "  inflating: sample_submission.csv   \n",
            "  inflating: test.csv                \n",
            "  inflating: train.csv               \n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_data = pd.read_csv(\"/content/test.csv\")\n",
        "train_data = pd.read_csv(\"/content/train.csv\")\n",
        "\n",
        "test_data.head()\n",
        "train_data.head()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:05:51.887897Z",
          "iopub.execute_input": "2023-12-09T03:05:51.888351Z",
          "iopub.status.idle": "2023-12-09T03:05:52.026869Z",
          "shell.execute_reply.started": "2023-12-09T03:05:51.888316Z",
          "shell.execute_reply": "2023-12-09T03:05:52.025505Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 258
        },
        "id": "yfvBvkeGmh6Y",
        "outputId": "eb8b2cf9-1e40-4aba-c928-ef449a93cb73"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       textID                                               text  \\\n",
              "0  cb774db0d1                I`d have responded, if I were going   \n",
              "1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   \n",
              "2  088c60f138                          my boss is bullying me...   \n",
              "3  9642c003ef                     what interview! leave me alone   \n",
              "4  358bd9e861   Sons of ****, why couldn`t they put them on t...   \n",
              "\n",
              "                         selected_text sentiment  \n",
              "0  I`d have responded, if I were going   neutral  \n",
              "1                             Sooo SAD  negative  \n",
              "2                          bullying me  negative  \n",
              "3                       leave me alone  negative  \n",
              "4                        Sons of ****,  negative  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-7b4501f3-e383-4523-881d-b8fc1dd19732\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>textID</th>\n",
              "      <th>text</th>\n",
              "      <th>selected_text</th>\n",
              "      <th>sentiment</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>cb774db0d1</td>\n",
              "      <td>I`d have responded, if I were going</td>\n",
              "      <td>I`d have responded, if I were going</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>549e992a42</td>\n",
              "      <td>Sooo SAD I will miss you here in San Diego!!!</td>\n",
              "      <td>Sooo SAD</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>088c60f138</td>\n",
              "      <td>my boss is bullying me...</td>\n",
              "      <td>bullying me</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>9642c003ef</td>\n",
              "      <td>what interview! leave me alone</td>\n",
              "      <td>leave me alone</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>358bd9e861</td>\n",
              "      <td>Sons of ****, why couldn`t they put them on t...</td>\n",
              "      <td>Sons of ****,</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7b4501f3-e383-4523-881d-b8fc1dd19732')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-7b4501f3-e383-4523-881d-b8fc1dd19732 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-7b4501f3-e383-4523-881d-b8fc1dd19732');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-7f93d536-0713-47f8-a2bd-a22ba0017957\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-7f93d536-0713-47f8-a2bd-a22ba0017957')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-7f93d536-0713-47f8-a2bd-a22ba0017957 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "train_data",
              "summary": "{\n  \"name\": \"train_data\",\n  \"rows\": 27481,\n  \"fields\": [\n    {\n      \"column\": \"textID\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 27481,\n        \"samples\": [\n          \"a7f72a928a\",\n          \"ef42dee96c\",\n          \"07d17131b1\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 27480,\n        \"samples\": [\n          \" Enjoy! Family trumps everything\",\n          \" --of them kinda turns me off of it all.  And then I buy more of them and dig a deeper hole, etc. ;;\",\n          \"Clive it`s my birthday pat me  http://apps.facebook.com/dogbook/profile/view/6386106\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"selected_text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 22463,\n        \"samples\": [\n          \"we win\",\n          \"YES!!! haahaaa.! break out the jellybeaniesss!\",\n          \"hay wats ur AIM? we should chat\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"neutral\",\n          \"negative\",\n          \"positive\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Check basic information\n",
        "train_data.describe()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:18.849022Z",
          "iopub.execute_input": "2023-12-09T03:06:18.849502Z",
          "iopub.status.idle": "2023-12-09T03:06:18.943085Z",
          "shell.execute_reply.started": "2023-12-09T03:06:18.849465Z",
          "shell.execute_reply": "2023-12-09T03:06:18.941595Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        },
        "id": "LGQvMS65mh6Y",
        "outputId": "e2ef44af-d069-4bb1-99f2-563bc35ec443"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "            textID                                  text selected_text  \\\n",
              "count        27481                                 27480         27480   \n",
              "unique       27481                                 27480         22463   \n",
              "top     cb774db0d1   I`d have responded, if I were going          good   \n",
              "freq             1                                     1           199   \n",
              "\n",
              "       sentiment  \n",
              "count      27481  \n",
              "unique         3  \n",
              "top      neutral  \n",
              "freq       11118  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-ad0427ca-cb16-43e0-82bb-b17a3e70e007\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>textID</th>\n",
              "      <th>text</th>\n",
              "      <th>selected_text</th>\n",
              "      <th>sentiment</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>27481</td>\n",
              "      <td>27480</td>\n",
              "      <td>27480</td>\n",
              "      <td>27481</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>27481</td>\n",
              "      <td>27480</td>\n",
              "      <td>22463</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>cb774db0d1</td>\n",
              "      <td>I`d have responded, if I were going</td>\n",
              "      <td>good</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>199</td>\n",
              "      <td>11118</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ad0427ca-cb16-43e0-82bb-b17a3e70e007')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-ad0427ca-cb16-43e0-82bb-b17a3e70e007 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-ad0427ca-cb16-43e0-82bb-b17a3e70e007');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-eb23c673-c67d-4a2b-ab0e-fae860d855eb\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-eb23c673-c67d-4a2b-ab0e-fae860d855eb')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-eb23c673-c67d-4a2b-ab0e-fae860d855eb button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"train_data\",\n  \"rows\": 4,\n  \"fields\": [\n    {\n      \"column\": \"textID\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"27481\",\n          \"cb774db0d1\",\n          \"1\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"27480\",\n          \" I`d have responded, if I were going\",\n          \"1\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"selected_text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          22463,\n          \"199\",\n          \"27480\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          3,\n          \"11118\",\n          \"27481\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 24
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_data.describe()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:22.059012Z",
          "iopub.execute_input": "2023-12-09T03:06:22.059505Z",
          "iopub.status.idle": "2023-12-09T03:06:22.087533Z",
          "shell.execute_reply.started": "2023-12-09T03:06:22.059474Z",
          "shell.execute_reply": "2023-12-09T03:06:22.086136Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        },
        "id": "ziO__MDtmh6Z",
        "outputId": "d4ed2938-e7de-4902-cf06-810dbb86a567"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "            textID                                               text  \\\n",
              "count         3534                                               3534   \n",
              "unique        3534                                               3534   \n",
              "top     f87dea47db  Last session of the day  http://twitpic.com/67ezh   \n",
              "freq             1                                                  1   \n",
              "\n",
              "       sentiment  \n",
              "count       3534  \n",
              "unique         3  \n",
              "top      neutral  \n",
              "freq        1430  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-f2470ee3-a39b-4d22-8486-884b850c3405\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>textID</th>\n",
              "      <th>text</th>\n",
              "      <th>sentiment</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>3534</td>\n",
              "      <td>3534</td>\n",
              "      <td>3534</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>unique</th>\n",
              "      <td>3534</td>\n",
              "      <td>3534</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>f87dea47db</td>\n",
              "      <td>Last session of the day  http://twitpic.com/67ezh</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>freq</th>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1430</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f2470ee3-a39b-4d22-8486-884b850c3405')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-f2470ee3-a39b-4d22-8486-884b850c3405 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-f2470ee3-a39b-4d22-8486-884b850c3405');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-27dac3cc-b7f4-4396-acdb-6ee946af87e7\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-27dac3cc-b7f4-4396-acdb-6ee946af87e7')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-27dac3cc-b7f4-4396-acdb-6ee946af87e7 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"test_data\",\n  \"rows\": 4,\n  \"fields\": [\n    {\n      \"column\": \"textID\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"3534\",\n          \"f87dea47db\",\n          \"1\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"3534\",\n          \"Last session of the day  http://twitpic.com/67ezh\",\n          \"1\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          3,\n          \"1430\",\n          \"3534\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Check for null values of train data\n",
        "train_data.isna().sum()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:02.642708Z",
          "iopub.execute_input": "2023-12-09T03:06:02.643173Z",
          "iopub.status.idle": "2023-12-09T03:06:02.666775Z",
          "shell.execute_reply.started": "2023-12-09T03:06:02.643140Z",
          "shell.execute_reply": "2023-12-09T03:06:02.665157Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "sW_rLEvTmh6Z",
        "outputId": "f5974af5-e699-4372-d4de-32d330e96737"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "textID           0\n",
              "text             1\n",
              "selected_text    1\n",
              "sentiment        0\n",
              "dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 26
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop null values\n",
        "train_data.dropna(inplace=True)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:34.309793Z",
          "iopub.execute_input": "2023-12-09T03:06:34.310425Z",
          "iopub.status.idle": "2023-12-09T03:06:34.336322Z",
          "shell.execute_reply.started": "2023-12-09T03:06:34.310375Z",
          "shell.execute_reply": "2023-12-09T03:06:34.334742Z"
        },
        "trusted": true,
        "id": "BYth5ceQmh6Z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Check for null values of test data\n",
        "test_data.isna().sum()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:38.689673Z",
          "iopub.execute_input": "2023-12-09T03:06:38.690104Z",
          "iopub.status.idle": "2023-12-09T03:06:38.701870Z",
          "shell.execute_reply.started": "2023-12-09T03:06:38.690071Z",
          "shell.execute_reply": "2023-12-09T03:06:38.700699Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Gad-bU3qmh6a",
        "outputId": "b91e3e03-8c82-4993-9f35-4c98313a07eb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "textID       0\n",
              "text         0\n",
              "sentiment    0\n",
              "dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 28
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop null values\n",
        "test_data.dropna(inplace=True)"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:50.905069Z",
          "iopub.execute_input": "2023-12-09T03:06:50.906112Z",
          "iopub.status.idle": "2023-12-09T03:06:50.913867Z",
          "shell.execute_reply.started": "2023-12-09T03:06:50.906067Z",
          "shell.execute_reply": "2023-12-09T03:06:50.912656Z"
        },
        "trusted": true,
        "id": "yD8fZWvnmh6a"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Count seniment values\n",
        "train_data['sentiment'].value_counts()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:14:49.588524Z",
          "iopub.execute_input": "2023-12-09T03:14:49.589545Z",
          "iopub.status.idle": "2023-12-09T03:14:49.604619Z",
          "shell.execute_reply.started": "2023-12-09T03:14:49.589491Z",
          "shell.execute_reply": "2023-12-09T03:14:49.603145Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tIX5rua1mh6b",
        "outputId": "d7611ee3-cd43-4e04-d405-7dbf3a8b6fcf"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "neutral     11117\n",
              "positive     8582\n",
              "negative     7781\n",
              "Name: sentiment, dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 30
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Analysis text length\n",
        "train_data['text_length'] = train_data['text'].apply(len)\n",
        "train_data.groupby('sentiment')['text_length'].mean()"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:06:53.491565Z",
          "iopub.execute_input": "2023-12-09T03:06:53.492804Z",
          "iopub.status.idle": "2023-12-09T03:06:53.531364Z",
          "shell.execute_reply.started": "2023-12-09T03:06:53.492759Z",
          "shell.execute_reply": "2023-12-09T03:06:53.530181Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gvJ2x3Rtmh6a",
        "outputId": "3aff7c0a-44fe-4ca8-80c3-24d8cfd6867a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "sentiment\n",
              "negative    70.488112\n",
              "neutral     65.206800\n",
              "positive    70.419133\n",
              "Name: text_length, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 31
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Analysis selected text length\n",
        "train_data['selected_text_length'] = train_data['selected_text'].apply(len)\n",
        "train_data.groupby('sentiment')['selected_text_length'].mean()\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YMMpNaeUnRag",
        "outputId": "a551d4d5-ec7a-42a6-9814-63f4a4b08567"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "sentiment\n",
              "negative    19.970698\n",
              "neutral     62.765134\n",
              "positive    18.124680\n",
              "Name: selected_text_length, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ANOVA results analysis\n",
        "from scipy import stats\n",
        "\n",
        "f_val, p_val = stats.f_oneway(train_data[train_data['sentiment'] == 'positive']['text_length'],\n",
        "                              train_data[train_data['sentiment'] == 'negative']['text_length'],\n",
        "                              train_data[train_data['sentiment'] == 'neutral']['text_length'])\n",
        "\n",
        "print(\"ANOVA Test Results:\")\n",
        "print(f\"F-statistic: {f_val}\")\n",
        "print(f\"P-value: {p_val}\")\n",
        "\n",
        "# Interpret the results\n",
        "alpha = 0.05\n",
        "if p_val<alpha:\n",
        "     print(\"The means of at least two groups are significantly different.\")\n",
        "else:\n",
        "    print(\"There is no significant difference in the means of the groups.\")"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:17:23.632685Z",
          "iopub.execute_input": "2023-12-09T03:17:23.633161Z",
          "iopub.status.idle": "2023-12-09T03:18:41.566408Z",
          "shell.execute_reply.started": "2023-12-09T03:17:23.633128Z",
          "shell.execute_reply": "2023-12-09T03:18:41.564637Z"
        },
        "trusted": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8u8vnTYrmh6b",
        "outputId": "59af0801-38f3-4f6f-e7d6-3e6ef11982bc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "ANOVA Test Results:\n",
            "F-statistic: 72.2127709711816\n",
            "P-value: 5.254438748898152e-32\n",
            "The means of at least two groups are significantly different.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Data Processing"
      ],
      "metadata": {
        "id": "j5gAPl2Umh6b"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import re\n",
        "import string\n",
        "\n",
        "def clean_text(text):\n",
        "  text = text.lower()\n",
        "  text = re.sub(r\"what's\", \"what is \", text)\n",
        "  text = re.sub(r\"\\'s\", \" \", text)\n",
        "  text = re.sub(r\"\\'ve\", \" have \", text)\n",
        "  text = re.sub(r\"can't\", \"cannot \", text)\n",
        "  text = re.sub(r\"n't\", \" not \", text)\n",
        "  return text\n",
        "\n",
        "train_data['text'] = train_data['text'].apply(clean_text)\n",
        "test_data['text'] = test_data['text'].apply(clean_text)"
      ],
      "metadata": {
        "id": "BphYjewqpCaL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "\n",
        "# Tokenize the text\n",
        "token = Tokenizer(num_words = 500)\n",
        "token.fit_on_texts(train_data['text'])\n",
        "\n",
        "# Convert texts to sequence of integers\n",
        "train_sequences = token.texts_to_sequences(train_data['text'])\n",
        "test_sequences = token.texts_to_sequences(test_data['text'])\n",
        "\n",
        "# Convert labels to categorical one-hot encoding\n",
        "train_labels = pd.get_dummies(train_data['sentiment']).values\n",
        "test_labels = pd.get_dummies(test_data['sentiment']).values\n"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-09T03:21:59.215359Z",
          "iopub.execute_input": "2023-12-09T03:21:59.215864Z",
          "iopub.status.idle": "2023-12-09T03:22:00.003619Z",
          "shell.execute_reply.started": "2023-12-09T03:21:59.215829Z",
          "shell.execute_reply": "2023-12-09T03:22:00.001594Z"
        },
        "trusted": true,
        "id": "1iM34W3Kmh6b"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Vectorize the text\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "\n",
        "max_len = max([len(x) for x in train_sequences])\n",
        "train_padded = pad_sequences(train_sequences, maxlen = max_len, padding = \"post\", truncating = \"post\")\n",
        "test_padded = pad_sequences(test_sequences, maxlen = max_len, padding = \"post\", truncating = \"post\")\n",
        "\n",
        "print(max_len)\n",
        "print(train_padded.shape)\n",
        "print(test_padded.shape)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "46S4ueo3mh6b",
        "outputId": "7a345779-d3ea-4656-fdd4-65d819a0ffa2"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "32\n",
            "(27480, 32)\n",
            "(3534, 32)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Build the model"
      ],
      "metadata": {
        "id": "_B7z4_rmsNbJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout\n",
        "from tensorflow.keras.callbacks import EarlyStopping\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.utils import to_categorical\n",
        "import numpy as np\n",
        "import tensorflow as tf"
      ],
      "metadata": {
        "execution": {
          "iopub.status.busy": "2023-12-08T08:48:28.404499Z",
          "iopub.execute_input": "2023-12-08T08:48:28.404989Z",
          "iopub.status.idle": "2023-12-08T08:48:28.756170Z",
          "shell.execute_reply.started": "2023-12-08T08:48:28.404953Z",
          "shell.execute_reply": "2023-12-08T08:48:28.754753Z"
        },
        "trusted": true,
        "id": "vzFXiRI5mh6b"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "np.random.seed(42)\n",
        "tf.random.set_seed(42)"
      ],
      "metadata": {
        "id": "rmY2dSgAI6wL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def lstm_model(max_len):\n",
        "  model = Sequential()\n",
        "\n",
        "  # Add embedding layer\n",
        "  model.add(Embedding(input_dim = 500, output_dim = 32, input_length = max_len))\n",
        "\n",
        "  # Add LSTM layer\n",
        "  model.add(LSTM(64, return_sequences=True))\n",
        "  model.add(Dropout(0.5))\n",
        "\n",
        "  # Add LSTM layer without returning sequences\n",
        "  model.add(LSTM(32))\n",
        "  model.add(Dropout(0.5))\n",
        "\n",
        "  # Add dense layer\n",
        "  model.add(Dense(3, activation='sigmoid'))\n",
        "\n",
        "  return model\n",
        "\n",
        "def gru_model(max_len):\n",
        "  model = Sequential()\n",
        "\n",
        "  # Add embedding layer\n",
        "  model.add(Embedding(input_dim = 500, output_dim = 32, input_length = max_len))\n",
        "  # Add GRU layer\n",
        "  model.add(GRU(128, return_sequences=True))\n",
        "  model.add(Dropout(0.5))\n",
        "\n",
        "  # Add GRU layer without returning sequences\n",
        "  model.add(GRU(32))\n",
        "  model.add(Dropout(0.5))\n",
        "\n",
        "  # Add dense layer\n",
        "  model.add(Dense(3, activation='sigmoid'))\n",
        "\n",
        "  return model"
      ],
      "metadata": {
        "id": "h51X_LSwsSd1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "lstm = lstm_model(max_len)\n",
        "lstm.summary()\n",
        "# Complie the models\n",
        "lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
        "gru_model(max_len).compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Z2YzFGqu0iIz",
        "outputId": "c3bd49c0-6aba-418d-aac3-8174818b90ca"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Model: \"sequential\"\n",
            "_________________________________________________________________\n",
            " Layer (type)                Output Shape              Param #   \n",
            "=================================================================\n",
            " embedding (Embedding)       (None, 32, 32)            16000     \n",
            "                                                                 \n",
            " lstm (LSTM)                 (None, 32, 64)            24832     \n",
            "                                                                 \n",
            " dropout (Dropout)           (None, 32, 64)            0         \n",
            "                                                                 \n",
            " lstm_1 (LSTM)               (None, 32)                12416     \n",
            "                                                                 \n",
            " dropout_1 (Dropout)         (None, 32)                0         \n",
            "                                                                 \n",
            " dense (Dense)               (None, 3)                 99        \n",
            "                                                                 \n",
            "=================================================================\n",
            "Total params: 53347 (208.39 KB)\n",
            "Trainable params: 53347 (208.39 KB)\n",
            "Non-trainable params: 0 (0.00 Byte)\n",
            "_________________________________________________________________\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "gru = gru_model(max_len)\n",
        "gru.summary()\n",
        "# Complie the models\n",
        "gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "m5zd-TmeBb5p",
        "outputId": "0907d4cb-5675-4cbd-e6ea-fc08bb9c6368"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Model: \"sequential_2\"\n",
            "_________________________________________________________________\n",
            " Layer (type)                Output Shape              Param #   \n",
            "=================================================================\n",
            " embedding_2 (Embedding)     (None, 32, 32)            16000     \n",
            "                                                                 \n",
            " gru_2 (GRU)                 (None, 32, 128)           62208     \n",
            "                                                                 \n",
            " dropout_4 (Dropout)         (None, 32, 128)           0         \n",
            "                                                                 \n",
            " gru_3 (GRU)                 (None, 32)                15552     \n",
            "                                                                 \n",
            " dropout_5 (Dropout)         (None, 32)                0         \n",
            "                                                                 \n",
            " dense_2 (Dense)             (None, 3)                 99        \n",
            "                                                                 \n",
            "=================================================================\n",
            "Total params: 93859 (366.64 KB)\n",
            "Trainable params: 93859 (366.64 KB)\n",
            "Non-trainable params: 0 (0.00 Byte)\n",
            "_________________________________________________________________\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Train the models"
      ],
      "metadata": {
        "id": "mSijiLQZ0dk3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Train the models with early stopping\n",
        "early_stopping = EarlyStopping(patience=3, restore_best_weights=True)"
      ],
      "metadata": {
        "id": "w72HRLgx4e4Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "lstm_history = lstm.fit(train_padded, train_labels, epochs=10, validation_split=0.2)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BxLCU34-4gtj",
        "outputId": "d2ccbde3-5a44-446c-ced1-d1f180276c1d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/10\n",
            "687/687 [==============================] - 32s 37ms/step - loss: 1.0219 - accuracy: 0.4775 - val_loss: 0.8188 - val_accuracy: 0.6527\n",
            "Epoch 2/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.8102 - accuracy: 0.6598 - val_loss: 0.7817 - val_accuracy: 0.6652\n",
            "Epoch 3/10\n",
            "687/687 [==============================] - 25s 37ms/step - loss: 0.7768 - accuracy: 0.6731 - val_loss: 0.7642 - val_accuracy: 0.6678\n",
            "Epoch 4/10\n",
            "687/687 [==============================] - 24s 35ms/step - loss: 0.7589 - accuracy: 0.6784 - val_loss: 0.7515 - val_accuracy: 0.6765\n",
            "Epoch 5/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.7517 - accuracy: 0.6802 - val_loss: 0.7628 - val_accuracy: 0.6752\n",
            "Epoch 6/10\n",
            "687/687 [==============================] - 24s 35ms/step - loss: 0.7412 - accuracy: 0.6848 - val_loss: 0.7506 - val_accuracy: 0.6809\n",
            "Epoch 7/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.7348 - accuracy: 0.6894 - val_loss: 0.7580 - val_accuracy: 0.6674\n",
            "Epoch 8/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.7282 - accuracy: 0.6905 - val_loss: 0.7830 - val_accuracy: 0.6590\n",
            "Epoch 9/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.7207 - accuracy: 0.6943 - val_loss: 0.7708 - val_accuracy: 0.6661\n",
            "Epoch 10/10\n",
            "687/687 [==============================] - 23s 34ms/step - loss: 0.7152 - accuracy: 0.6964 - val_loss: 0.7541 - val_accuracy: 0.6774\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "gru_history= gru.fit(train_padded, train_labels, epochs=10, validation_split=0.2, callbacks=[early_stopping])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "u8js0eVyBut3",
        "outputId": "bf75fabb-ff65-4b7d-d179-c655714fceb8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/10\n",
            "687/687 [==============================] - 40s 52ms/step - loss: 1.0896 - accuracy: 0.3977 - val_loss: 1.0874 - val_accuracy: 0.4127\n",
            "Epoch 2/10\n",
            "687/687 [==============================] - 38s 55ms/step - loss: 1.0894 - accuracy: 0.4025 - val_loss: 1.0858 - val_accuracy: 0.4127\n",
            "Epoch 3/10\n",
            "687/687 [==============================] - 35s 50ms/step - loss: 1.0880 - accuracy: 0.4026 - val_loss: 1.0870 - val_accuracy: 0.4127\n",
            "Epoch 4/10\n",
            "687/687 [==============================] - 38s 55ms/step - loss: 1.0884 - accuracy: 0.4028 - val_loss: 1.0857 - val_accuracy: 0.4127\n",
            "Epoch 5/10\n",
            "687/687 [==============================] - 35s 51ms/step - loss: 1.0883 - accuracy: 0.4025 - val_loss: 1.0855 - val_accuracy: 0.4127\n",
            "Epoch 6/10\n",
            "687/687 [==============================] - 37s 54ms/step - loss: 0.9598 - accuracy: 0.5236 - val_loss: 0.7822 - val_accuracy: 0.6658\n",
            "Epoch 7/10\n",
            "687/687 [==============================] - 35s 51ms/step - loss: 0.7834 - accuracy: 0.6618 - val_loss: 0.7622 - val_accuracy: 0.6738\n",
            "Epoch 8/10\n",
            "687/687 [==============================] - 37s 54ms/step - loss: 0.7639 - accuracy: 0.6724 - val_loss: 0.7650 - val_accuracy: 0.6721\n",
            "Epoch 9/10\n",
            "687/687 [==============================] - 35s 51ms/step - loss: 0.7415 - accuracy: 0.6831 - val_loss: 0.7536 - val_accuracy: 0.6734\n",
            "Epoch 10/10\n",
            "687/687 [==============================] - 37s 53ms/step - loss: 0.7278 - accuracy: 0.6897 - val_loss: 0.7486 - val_accuracy: 0.6772\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def jaccard(str1, str2):\n",
        "    a = set(str1.lower().split())\n",
        "    b = set(str2.lower().split())\n",
        "    c = a.intersection(b)\n",
        "    return float(len(c)) / (len(a) + len(b) - len(c))\n",
        "\n",
        "def evaluate_model(model, tokenizer, data, true_texts):\n",
        "    \"\"\"\n",
        "    Evaluate the model using the Jaccard score.\n",
        "    - model: The trained model (LSTM or GRU)\n",
        "    - tokenizer: Tokenizer used for the model\n",
        "    - data: The input data for prediction (features)\n",
        "    - true_texts: The true output texts (labels)\n",
        "\n",
        "    Returns the average Jaccard score for the dataset.\n",
        "    \"\"\"\n",
        "    # Generate predictions\n",
        "    predictions = model.predict(data)\n",
        "\n",
        "    pred_texts = [\" \".join(tokenizer.sequences_to_texts([p])) for p in predictions]\n",
        "\n",
        "    # Compute Jaccard scores\n",
        "    scores = [jaccard(pred, true) for pred, true in zip(pred_texts, true_texts)]\n",
        "\n",
        "    # Return the average Jaccard score\n",
        "    return sum(scores) / len(scores)"
      ],
      "metadata": {
        "id": "UGLCcoKzBuDm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate on test data\n",
        "test_loss, test_acc = lstm.evaluate(test_padded, test_labels)\n",
        "print(\"Test accuracy:\", test_acc)\n",
        "\n",
        "test_loss, test_acc = gru.evaluate(test_padded, test_labels)\n",
        "print(\"Test accuracy:\", test_acc)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZVbw5hb6C9Yg",
        "outputId": "179c13e3-64bf-4117-8a44-c0e35978fa87"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "111/111 [==============================] - 2s 14ms/step - loss: 0.7518 - accuracy: 0.6672\n",
            "Test accuracy: 0.6672325730323792\n",
            "111/111 [==============================] - 2s 16ms/step - loss: 0.7480 - accuracy: 0.6692\n",
            "Test accuracy: 0.6692133545875549\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Model accuracy\n",
        "lstm_val_acc = max(lstm_history.history['val_accuracy'])\n",
        "gru_val_acc = max(gru_history.history['val_accuracy'])\n",
        "\n",
        "print(\"LSTM validation accuracy:\", lstm_val_acc)\n",
        "print(\"GRU validation accuracy:\", gru_val_acc)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fTkR4aemFpXS",
        "outputId": "bf7b8065-4a28-44fb-ffe3-f006b7dd9fdc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "LSTM validation accuracy: 0.6808587908744812\n",
            "GRU validation accuracy: 0.677219808101654\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Confusion matrix\n",
        "from sklearn.metrics import classification_report, confusion_matrix\n",
        "lstm_pred = lstm.predict(test_padded)\n",
        "gru_pred = gru.predict(test_padded)\n",
        "\n",
        "print(classification_report(test_labels.argmax(axis=1), lstm_pred.argmax(axis=1)))\n",
        "print(classification_report(test_labels.argmax(axis=1), gru_pred.argmax(axis=1)))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Vu1OEfmuMd5S",
        "outputId": "b866ec81-2d9f-440a-9cf5-3eadacc159bc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "111/111 [==============================] - 2s 11ms/step\n",
            "111/111 [==============================] - 2s 14ms/step\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.78      0.46      0.58      1001\n",
            "           1       0.58      0.79      0.67      1430\n",
            "           2       0.76      0.70      0.73      1103\n",
            "\n",
            "    accuracy                           0.67      3534\n",
            "   macro avg       0.71      0.65      0.66      3534\n",
            "weighted avg       0.70      0.67      0.66      3534\n",
            "\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.75      0.47      0.58      1001\n",
            "           1       0.58      0.79      0.67      1430\n",
            "           2       0.79      0.69      0.74      1103\n",
            "\n",
            "    accuracy                           0.67      3534\n",
            "   macro avg       0.71      0.65      0.66      3534\n",
            "weighted avg       0.69      0.67      0.67      3534\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "25XNKkNYCQae"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datetime import datetime\n",
        "\n",
        "def save_model(model, prefix =''):\n",
        "  # Get the current time and filename\n",
        "  current_time = datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n",
        "  filename = f\"{prefix}model_{current_time}.csv\"\n",
        "  # Save the models\n",
        "  model.save(filename)\n",
        "  print(f\"Model saved to {filename}\")\n",
        "\n",
        "save_model(lstm, prefix = \"lstm\")\n",
        "save_model(gru, prefix = \"gru\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LYqnTQa-bdqn",
        "outputId": "e248dd33-fa4a-4c52-88c2-01d216316d18"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Model saved to lstmmodel_2024-03-04-015021.csv\n",
            "Model saved to grumodel_2024-03-04-015029.csv\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install --upgrade kaggle"
      ],
      "metadata": {
        "id": "vgjLVo-l_LO9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "! kaggle competitions submit -c tweet-sentiment-extraction -f '/content/lstmmodel_2024-03-04-005517.csv' -m late-submission"
      ],
      "metadata": {
        "id": "kmaX09fo9Urz",
        "outputId": "24134994-354d-4d1b-8ac2-9e7ab86e3994",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/bin/kaggle\", line 8, in <module>\n",
            "    sys.exit(main())\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/kaggle/cli.py\", line 70, in main\n",
            "    out = args.func(**command_args)\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py\", line 801, in competition_submit_cli\n",
            "    submit_result = self.competition_submit(file_name, message,\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py\", line 752, in competition_submit\n",
            "    content_length=os.path.getsize(file_name),\n",
            "  File \"/usr/lib/python3.10/genericpath.py\", line 50, in getsize\n",
            "    return os.stat(filename).st_size\n",
            "FileNotFoundError: [Errno 2] No such file or directory: '/content/lstmmodel_2024-03-04-005517.csv'\n"
          ]
        }
      ]
    }
  ]
}