@satyajitghana
Created June 6, 2021 07:15
SST_Dataset_Augmentation.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "SST_Dataset_Augmentation.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/satyajitghana/f232352e37a9f46205c61a2ae741e321/sst_dataset_augmentation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "mNwTHtsDKavA"
},
"source": [
"# Stanford Sentiment TreeBank Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "t7udJjj-Ly-i",
"outputId": "359259d3-487a-45ed-85c5-ab7539568daf"
},
"source": [
"! pip install swifter --quiet"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[K |████████████████████████████████| 634kB 7.8MB/s \n",
"\u001b[K |████████████████████████████████| 296kB 12.8MB/s \n",
"\u001b[K |████████████████████████████████| 583kB 25.3MB/s \n",
"\u001b[K |████████████████████████████████| 48.5MB 78kB/s \n",
"\u001b[K |████████████████████████████████| 17.2MB 256kB/s \n",
"\u001b[K |████████████████████████████████| 71kB 8.3MB/s \n",
"\u001b[K |████████████████████████████████| 204kB 57.6MB/s \n",
"\u001b[K |████████████████████████████████| 133kB 51.7MB/s \n",
"\u001b[K |████████████████████████████████| 81kB 9.0MB/s \n",
"\u001b[K |████████████████████████████████| 81kB 8.1MB/s \n",
"\u001b[K |████████████████████████████████| 3.1MB 26.2MB/s \n",
"\u001b[K |████████████████████████████████| 92kB 8.6MB/s \n",
"\u001b[?25h Building wheel for swifter (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for gpustat (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[31mERROR: modin 0.9.1 has requirement pandas==1.2.3, but you'll have pandas 1.1.5 which is incompatible.\u001b[0m\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JDoFLfACKYSC"
},
"source": [
"## Get to know RAW Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pBGGh-9H6c2l",
"outputId": "b71a62f1-f2fc-437d-f8dc-9f54b241f379"
},
"source": [
"! wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2021-06-03 14:54:32-- http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip\n",
"Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140\n",
"Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]\n",
"--2021-06-03 14:54:32-- https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip\n",
"Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 6372817 (6.1M) [application/zip]\n",
"Saving to: ‘stanfordSentimentTreebank.zip’\n",
"\n",
"stanfordSentimentTr 100%[===================>] 6.08M 25.2MB/s in 0.2s \n",
"\n",
"2021-06-03 14:54:32 (25.2 MB/s) - ‘stanfordSentimentTreebank.zip’ saved [6372817/6372817]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vr-4d0VL6m1Z",
"outputId": "796627da-b567-4b40-956b-b1e853fec2d0"
},
"source": [
"! unzip stanfordSentimentTreebank.zip"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Archive: stanfordSentimentTreebank.zip\n",
" creating: stanfordSentimentTreebank/\n",
" inflating: stanfordSentimentTreebank/datasetSentences.txt \n",
" creating: __MACOSX/\n",
" creating: __MACOSX/stanfordSentimentTreebank/\n",
" inflating: __MACOSX/stanfordSentimentTreebank/._datasetSentences.txt \n",
" inflating: stanfordSentimentTreebank/datasetSplit.txt \n",
" inflating: __MACOSX/stanfordSentimentTreebank/._datasetSplit.txt \n",
" inflating: stanfordSentimentTreebank/dictionary.txt \n",
" inflating: __MACOSX/stanfordSentimentTreebank/._dictionary.txt \n",
" inflating: stanfordSentimentTreebank/original_rt_snippets.txt \n",
" inflating: __MACOSX/stanfordSentimentTreebank/._original_rt_snippets.txt \n",
" inflating: stanfordSentimentTreebank/README.txt \n",
" inflating: __MACOSX/stanfordSentimentTreebank/._README.txt \n",
" inflating: stanfordSentimentTreebank/sentiment_labels.txt \n",
" inflating: __MACOSX/stanfordSentimentTreebank/._sentiment_labels.txt \n",
" inflating: stanfordSentimentTreebank/SOStr.txt \n",
" inflating: stanfordSentimentTreebank/STree.txt \n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hykrz_eo6jEP"
},
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"from tqdm.auto import tqdm\n",
"import swifter # this does parallel processing to apply function\n",
"\n",
"tqdm.pandas()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "K6KYG1MZ7RDT"
},
"source": [
"sst_dir = 'stanfordSentimentTreebank'"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jsA_Tbi-6wnl"
},
"source": [
"sentiment_labels = pd.read_csv(os.path.join(sst_dir, \"sentiment_labels.txt\"), names=['phrase_ids', 'sentiment_values'], sep=\"|\", header=0)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "L8BsfZlCJZPV"
},
"source": [
"This is phrase ids and their corresponding sentiment values"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"id": "31iuctus7XuA",
"outputId": "b9b3ff51-87df-469f-cb09-30b9834f7d70"
},
"source": [
"sentiment_labels.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>phrase_ids</th>\n",
" <th>sentiment_values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0.50000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0.50000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0.44444</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0.50000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0.42708</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" phrase_ids sentiment_values\n",
"0 0 0.50000\n",
"1 1 0.50000\n",
"2 2 0.44444\n",
"3 3 0.50000\n",
"4 4 0.42708"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CX1TkOhGJdOq"
},
"source": [
"the sentiment_values [0-1] are mapped to 0-4, i.e. 5 values, see the README.txt in the dataset zip file to know more"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2_TxJUzH_n-s"
},
"source": [
"def discretize_label(label):\n",
" if label <= 0.2: return 0\n",
" if label <= 0.4: return 1\n",
" if label <= 0.6: return 2\n",
" if label <= 0.8: return 3\n",
" return 4"
],
"execution_count": null,
"outputs": []
},
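{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same binning can also be done vectorized with `pd.cut` instead of a row-wise `apply`. This is just a sketch for comparison; `include_lowest=True` makes the first bin cover 0.0, matching the `<=` boundaries in `discretize_label` above."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# vectorized alternative (sketch): bin the [0, 1] floats into 5 classes\n",
"binned = pd.cut(sentiment_labels['sentiment_values'],\n",
"                bins=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],\n",
"                labels=[0, 1, 2, 3, 4],\n",
"                include_lowest=True).astype(int)\n",
"binned.head()"
],
"execution_count": null,
"outputs": []
},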
{
"cell_type": "code",
"metadata": {
"id": "xUPYflCD_tK0"
},
"source": [
"sentiment_labels['sentiment_values'] = sentiment_labels['sentiment_values'].apply(discretize_label)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"id": "cBWMAPxY_50P",
"outputId": "873774fc-19ff-464b-8177-51d04eb2d738"
},
"source": [
"sentiment_labels.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>phrase_ids</th>\n",
" <th>sentiment_values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" phrase_ids sentiment_values\n",
"0 0 2\n",
"1 1 2\n",
"2 2 2\n",
"3 3 2\n",
"4 4 2"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bXNcIsSj7cwH"
},
"source": [
"sentence_ids = pd.read_csv(os.path.join(sst_dir, \"datasetSentences.txt\"), sep=\"\\t\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYnbdEVeJocN"
},
"source": [
"This is sentence index and its corresponding sentence"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"id": "v2crGNOx7d2E",
"outputId": "aa2a6235-67b2-4b47-f83a-1bb2eb738bbf"
},
"source": [
"sentence_ids.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_index</th>\n",
" <th>sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>The Rock is destined to be the 21st Century 's...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>The gorgeously elaborate continuation of `` Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Effective but too-tepid biopic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>If you sometimes like to go to the movies to h...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Emerges as something rare , an issue movie tha...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_index sentence\n",
"0 1 The Rock is destined to be the 21st Century 's...\n",
"1 2 The gorgeously elaborate continuation of `` Th...\n",
"2 3 Effective but too-tepid biopic\n",
"3 4 If you sometimes like to go to the movies to h...\n",
"4 5 Emerges as something rare , an issue movie tha..."
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zjOqAFYp7hiT"
},
"source": [
"dictionary = pd.read_csv(os.path.join(sst_dir, \"dictionary.txt\"), sep=\"|\", names=['phrase', 'phrase_ids'])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "xes4uBpKJspy"
},
"source": [
"This dictionary is the mapping of phrases to their phrase_ids"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"id": "2A9yaLg87ifW",
"outputId": "bd08a33d-5995-4389-de34-718d4f595ea8"
},
"source": [
"dictionary.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>phrase</th>\n",
" <th>phrase_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>!</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>! '</td>\n",
" <td>22935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>! ''</td>\n",
" <td>18235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>! Alas</td>\n",
" <td>179257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>! Brilliant</td>\n",
" <td>22936</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" phrase phrase_ids\n",
"0 ! 0\n",
"1 ! ' 22935\n",
"2 ! '' 18235\n",
"3 ! Alas 179257\n",
"4 ! Brilliant 22936"
]
},
"metadata": {
"tags": []
},
"execution_count": 20
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OGY66We2Jwu9"
},
"source": [
"This is the train, dev, test split\n",
"\n",
"here, 1 = train, 2 = dev, 3 = test"
]
},
{
"cell_type": "code",
"metadata": {
"id": "7eyow7Jp7k-C"
},
"source": [
"train_test_split = pd.read_csv(os.path.join(sst_dir, \"datasetSplit.txt\"))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"id": "sg2TEkZi7l_h",
"outputId": "589b448f-c918-4910-d72d-6834470f8a37"
},
"source": [
"train_test_split.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_index</th>\n",
" <th>splitset_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_index splitset_label\n",
"0 1 1\n",
"1 2 1\n",
"2 3 2\n",
"3 4 2\n",
"4 5 2"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4ooVS-fwJ5q2"
},
"source": [
"sentence_ids = sentence_index, sentence\n",
"dictionary = phrase, phrase_ids\n",
"\n",
"so what we do is \"join\" the two above, using phrase and sentence as the key, this is an inner join, similar to an SQL inner join, basically we'll get only those phrases where **sentence == phrase**, and now we will have sentence, phrase, phrase_ids\n",
"\n",
"note now that we have phrase_ids, and also we know the sentiment_value for each of the phrase_ids, we perform another join, this time using phrase_ids as the key.\n",
"\n",
"NOTE: below i first join them with the train_test_split, followed by the sentiment_labels, because the train_test_split in on sentence_index. you can do sentiment_lablels first also and the train_test_split join later, doesn't matter.\n",
"\n",
"**Its best to run these join one by one and see the results**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "MeNZoW0p70bL"
},
"source": [
"sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on='sentence', right_on='phrase')\n",
"sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')\n",
"dataset = pd.merge(sentence_phrase_split, sentiment_labels, on='phrase_ids')"
],
"execution_count": null,
"outputs": []
},
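{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inner joins silently drop rows that have no match, so it's worth sanity-checking sizes after each merge. A minimal sketch using the frames defined above:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# sanity check (sketch): how many sentences survived each inner join?\n",
"print(len(sentence_ids), len(sentence_phrase_merge), len(dataset))"
],
"execution_count": null,
"outputs": []
},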
{
"cell_type": "code",
"metadata": {
"id": "xTWYox_XXHjd"
},
"source": [
"dataset['phrase_cleaned'] = dataset['sentence'].str.replace(r\"\\s('s|'d|'re|'ll|'m|'ve|n't)\\b\", lambda m: m.group(1))"
],
"execution_count": null,
"outputs": []
},
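{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what the replacement above does, here is the same pattern with plain `re` on a hypothetical example string: it re-attaches PTB-style tokenized contractions (`'s`, `n't`, ...) by dropping the space before them."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import re\n",
"\n",
"# hypothetical example; \"Century 's\" -> \"Century's\", \"he 's\" -> \"he's\"\n",
"example = \"the 21st Century 's new conan and he 's going to make a splash\"\n",
"re.sub(r\"\\s('s|'d|'re|'ll|'m|'ve|n't)\\b\", lambda m: m.group(1), example)"
],
"execution_count": null,
"outputs": []
},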
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 262
},
"id": "CypFM4Pf6vCB",
"outputId": "f731d79d-dde5-4e03-a830-39cf20b239eb"
},
"source": [
"dataset.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_index</th>\n",
" <th>sentence</th>\n",
" <th>phrase</th>\n",
" <th>phrase_ids</th>\n",
" <th>splitset_label</th>\n",
" <th>sentiment_values</th>\n",
" <th>phrase_cleaned</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>The Rock is destined to be the 21st Century 's...</td>\n",
" <td>The Rock is destined to be the 21st Century 's...</td>\n",
" <td>226166</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>The Rock is destined to be the 21st Century's ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>The gorgeously elaborate continuation of `` Th...</td>\n",
" <td>The gorgeously elaborate continuation of `` Th...</td>\n",
" <td>226300</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>The gorgeously elaborate continuation of `` Th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Effective but too-tepid biopic</td>\n",
" <td>Effective but too-tepid biopic</td>\n",
" <td>13995</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Effective but too-tepid biopic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>If you sometimes like to go to the movies to h...</td>\n",
" <td>If you sometimes like to go to the movies to h...</td>\n",
" <td>14123</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>If you sometimes like to go to the movies to h...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Emerges as something rare , an issue movie tha...</td>\n",
" <td>Emerges as something rare , an issue movie tha...</td>\n",
" <td>13999</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>Emerges as something rare , an issue movie tha...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_index ... phrase_cleaned\n",
"0 1 ... The Rock is destined to be the 21st Century's ...\n",
"1 2 ... The gorgeously elaborate continuation of `` Th...\n",
"2 3 ... Effective but too-tepid biopic\n",
"3 4 ... If you sometimes like to go to the movies to h...\n",
"4 5 ... Emerges as something rare , an issue movie tha...\n",
"\n",
"[5 rows x 7 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 33
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "d-SFEbgW75P0",
"outputId": "7fcb6e0c-b2e8-467c-ffa3-56f16e20d110"
},
"source": [
"dataset.iloc[100]"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"sentence_index 101\n",
"sentence If nothing else , this movie introduces a prom...\n",
"phrase If nothing else , this movie introduces a prom...\n",
"phrase_ids 14114\n",
"splitset_label 2\n",
"sentiment_values 3\n",
"phrase_cleaned If nothing else , this movie introduces a prom...\n",
"Name: 100, dtype: object"
]
},
"metadata": {
"tags": []
},
"execution_count": 37
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 284
},
"id": "tMixwrUy7_hm",
"outputId": "dd48fa25-11a8-404f-de65-816f73c56adb"
},
"source": [
"dataset.describe()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_index</th>\n",
" <th>phrase_ids</th>\n",
" <th>splitset_label</th>\n",
" <th>sentiment_values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>11286.000000</td>\n",
" <td>11286.000000</td>\n",
" <td>11286.000000</td>\n",
" <td>11286.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>5910.961102</td>\n",
" <td>132003.589846</td>\n",
" <td>1.373294</td>\n",
" <td>2.059986</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3422.455572</td>\n",
" <td>68214.626430</td>\n",
" <td>0.647295</td>\n",
" <td>1.287835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>3467.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2951.250000</td>\n",
" <td>67402.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>5904.500000</td>\n",
" <td>144063.500000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>8865.750000</td>\n",
" <td>188139.750000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>11855.000000</td>\n",
" <td>238977.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_index phrase_ids splitset_label sentiment_values\n",
"count 11286.000000 11286.000000 11286.000000 11286.000000\n",
"mean 5910.961102 132003.589846 1.373294 2.059986\n",
"std 3422.455572 68214.626430 0.647295 1.287835\n",
"min 1.000000 3467.000000 1.000000 0.000000\n",
"25% 2951.250000 67402.000000 1.000000 1.000000\n",
"50% 5904.500000 144063.500000 1.000000 2.000000\n",
"75% 8865.750000 188139.750000 2.000000 3.000000\n",
"max 11855.000000 238977.000000 3.000000 4.000000"
]
},
"metadata": {
"tags": []
},
"execution_count": 38
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VtB2CSpNBHGO",
"outputId": "3fefb05f-6551-4ac3-a4ea-6cd46ca12654"
},
"source": [
"dataset.info()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 11286 entries, 0 to 11285\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence_index 11286 non-null int64 \n",
" 1 sentence 11286 non-null object\n",
" 2 phrase 11286 non-null object\n",
" 3 phrase_ids 11286 non-null int64 \n",
" 4 splitset_label 11286 non-null int64 \n",
" 5 sentiment_values 11286 non-null int64 \n",
" 6 phrase_cleaned 11286 non-null object\n",
"dtypes: int64(4), object(3)\n",
"memory usage: 705.4+ KB\n"
],
"name": "stdout"
}
]
},
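{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before saving, the splitset_label column can be used to materialize the three splits. A minimal sketch, assuming the merged `dataset` frame above (1 = train, 2 = dev, 3 = test):"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# materialize the train/dev/test splits from splitset_label\n",
"train_df = dataset[dataset['splitset_label'] == 1]\n",
"dev_df = dataset[dataset['splitset_label'] == 2]\n",
"test_df = dataset[dataset['splitset_label'] == 3]\n",
"len(train_df), len(dev_df), len(test_df)"
],
"execution_count": null,
"outputs": []
},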
{
"cell_type": "code",
"metadata": {
"id": "bKs8WE_SYdFr"
},
"source": [
"dataset.to_csv('sst_dataset_cleaned.csv')"
],
"execution_count": null,
"outputs": []
}
]
}