Last active
March 8, 2024 04:39
-
-
Save ZhiyaoShu/bba0a711f2f4b8f368c453dc9ae07641 to your computer and use it in GitHub Desktop.
Tweet Sentiment Extraction NLP.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"kernelspec": { | |
"language": "python", | |
"display_name": "Python 3", | |
"name": "python3" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.10.12", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"kaggle": { | |
"accelerator": "none", | |
"dataSources": [ | |
{ | |
"sourceId": 16295, | |
"databundleVersionId": 1099992, | |
"sourceType": "competition" | |
} | |
], | |
"dockerImageVersionId": 30615, | |
"isInternetEnabled": true, | |
"language": "python", | |
"sourceType": "notebook", | |
"isGpuEnabled": false | |
}, | |
"colab": { | |
"provenance": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat_minor": 0, | |
"nbformat": 4, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ZhiyaoShu/bba0a711f2f4b8f368c453dc9ae07641/tweet_sentiment_extraction_nlp.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Use the %pip magic (not !pip) so the package is installed into the running kernel's environment.\n", | |
"%pip install -q kaggle" | |
], | |
"metadata": { | |
"id": "tEsX1rtj0svh" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# NOTE: this reads from /content/drive, so Google Drive must already be mounted\n", | |
"# (run the drive.mount('/content/drive') cell before this one).\n", | |
"# Create the config directory first: cp fails when ~/.kaggle/ does not exist yet.\n", | |
"! mkdir -p ~/.kaggle && cp /content/drive/MyDrive/kaggle.json ~/.kaggle/" | |
], | |
"metadata": { | |
"id": "IEX4Y7n81pbZ" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"! chmod 600 ~/.kaggle/kaggle.json" | |
], | |
"metadata": { | |
"id": "qeVgGSVS3iDU" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"! kaggle datasets list" | |
], | |
"metadata": { | |
"id": "v_K6hdnu3jwB" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"import os\n", | |
"for dirname, _, filenames in os.walk('/kaggle/input'):\n", | |
" for filename in filenames:\n", | |
" print(os.path.join(dirname, filename))\n" | |
], | |
"metadata": { | |
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", | |
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:03:55.911081Z", | |
"iopub.execute_input": "2023-12-09T03:03:55.912258Z", | |
"iopub.status.idle": "2023-12-09T03:03:56.443447Z", | |
"shell.execute_reply.started": "2023-12-09T03:03:55.912211Z", | |
"shell.execute_reply": "2023-12-09T03:03:56.441627Z" | |
}, | |
"trusted": true, | |
"id": "BCLtIvzkmh6U" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/drive')" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "dMnUsKzuHHgq", | |
"outputId": "9b1f2c09-f65d-4f84-b802-0b4938a88ef4" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Mounted at /content/drive\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Intro: A practice exercise in social media sentiment extraction" | |
], | |
"metadata": { | |
"id": "dukkVz6omh6X" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import tensorflow as tf\n", | |
"print(tf.__version__)" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:05:31.877064Z", | |
"iopub.execute_input": "2023-12-09T03:05:31.877820Z", | |
"iopub.status.idle": "2023-12-09T03:05:51.675825Z", | |
"shell.execute_reply.started": "2023-12-09T03:05:31.877784Z", | |
"shell.execute_reply": "2023-12-09T03:05:51.674840Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "wmxvLE-ymh6Y", | |
"outputId": "08cc362d-d7eb-466c-d359-982e77055a2f" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"2.15.0\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"! kaggle competitions download -c tweet-sentiment-extraction" | |
], | |
"metadata": { | |
"id": "gPNHxpMH3rXS", | |
"outputId": "23771922-368c-4770-fb45-c3149f4af9fd", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Downloading tweet-sentiment-extraction.zip to /content\n", | |
"\r 0% 0.00/1.39M [00:00<?, ?B/s]\n", | |
"\r100% 1.39M/1.39M [00:00<00:00, 133MB/s]\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"! unzip tweet-sentiment-extraction.zip" | |
], | |
"metadata": { | |
"id": "ENI-yrZi4CUt", | |
"outputId": "d0c64924-1e65-4749-e858-e898fa75ec7b", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Archive: tweet-sentiment-extraction.zip\n", | |
" inflating: sample_submission.csv \n", | |
" inflating: test.csv \n", | |
" inflating: train.csv \n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Load the competition CSVs that the unzip step extracted into /content.\n", | |
"test_data = pd.read_csv(\"/content/test.csv\")\n", | |
"train_data = pd.read_csv(\"/content/train.csv\")\n", | |
"\n", | |
"# Only the last expression of a cell is rendered automatically, so the test\n", | |
"# preview must be displayed explicitly or its result is silently discarded.\n", | |
"display(test_data.head())\n", | |
"train_data.head()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:05:51.887897Z", | |
"iopub.execute_input": "2023-12-09T03:05:51.888351Z", | |
"iopub.status.idle": "2023-12-09T03:05:52.026869Z", | |
"shell.execute_reply.started": "2023-12-09T03:05:51.888316Z", | |
"shell.execute_reply": "2023-12-09T03:05:52.025505Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 258 | |
}, | |
"id": "yfvBvkeGmh6Y", | |
"outputId": "eb8b2cf9-1e40-4aba-c928-ef449a93cb73" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" textID text \\\n", | |
"0 cb774db0d1 I`d have responded, if I were going \n", | |
"1 549e992a42 Sooo SAD I will miss you here in San Diego!!! \n", | |
"2 088c60f138 my boss is bullying me... \n", | |
"3 9642c003ef what interview! leave me alone \n", | |
"4 358bd9e861 Sons of ****, why couldn`t they put them on t... \n", | |
"\n", | |
" selected_text sentiment \n", | |
"0 I`d have responded, if I were going neutral \n", | |
"1 Sooo SAD negative \n", | |
"2 bullying me negative \n", | |
"3 leave me alone negative \n", | |
"4 Sons of ****, negative " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-7b4501f3-e383-4523-881d-b8fc1dd19732\" class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>textID</th>\n", | |
" <th>text</th>\n", | |
" <th>selected_text</th>\n", | |
" <th>sentiment</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>cb774db0d1</td>\n", | |
" <td>I`d have responded, if I were going</td>\n", | |
" <td>I`d have responded, if I were going</td>\n", | |
" <td>neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>549e992a42</td>\n", | |
" <td>Sooo SAD I will miss you here in San Diego!!!</td>\n", | |
" <td>Sooo SAD</td>\n", | |
" <td>negative</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>088c60f138</td>\n", | |
" <td>my boss is bullying me...</td>\n", | |
" <td>bullying me</td>\n", | |
" <td>negative</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>9642c003ef</td>\n", | |
" <td>what interview! leave me alone</td>\n", | |
" <td>leave me alone</td>\n", | |
" <td>negative</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>358bd9e861</td>\n", | |
" <td>Sons of ****, why couldn`t they put them on t...</td>\n", | |
" <td>Sons of ****,</td>\n", | |
" <td>negative</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <div class=\"colab-df-buttons\">\n", | |
"\n", | |
" <div class=\"colab-df-container\">\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7b4501f3-e383-4523-881d-b8fc1dd19732')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
"\n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" .colab-df-buttons div {\n", | |
" margin-bottom: 4px;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-7b4501f3-e383-4523-881d-b8fc1dd19732 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-7b4501f3-e383-4523-881d-b8fc1dd19732');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
"\n", | |
"<div id=\"df-7f93d536-0713-47f8-a2bd-a22ba0017957\">\n", | |
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-7f93d536-0713-47f8-a2bd-a22ba0017957')\"\n", | |
" title=\"Suggest charts\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <g>\n", | |
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", | |
" </g>\n", | |
"</svg>\n", | |
" </button>\n", | |
"\n", | |
"<style>\n", | |
" .colab-df-quickchart {\n", | |
" --bg-color: #E8F0FE;\n", | |
" --fill-color: #1967D2;\n", | |
" --hover-bg-color: #E2EBFA;\n", | |
" --hover-fill-color: #174EA6;\n", | |
" --disabled-fill-color: #AAA;\n", | |
" --disabled-bg-color: #DDD;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-quickchart {\n", | |
" --bg-color: #3B4455;\n", | |
" --fill-color: #D2E3FC;\n", | |
" --hover-bg-color: #434B5C;\n", | |
" --hover-fill-color: #FFFFFF;\n", | |
" --disabled-bg-color: #3B4455;\n", | |
" --disabled-fill-color: #666;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart {\n", | |
" background-color: var(--bg-color);\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: var(--fill-color);\n", | |
" height: 32px;\n", | |
" padding: 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart:hover {\n", | |
" background-color: var(--hover-bg-color);\n", | |
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: var(--button-hover-fill-color);\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart-complete:disabled,\n", | |
" .colab-df-quickchart-complete:disabled:hover {\n", | |
" background-color: var(--disabled-bg-color);\n", | |
" fill: var(--disabled-fill-color);\n", | |
" box-shadow: none;\n", | |
" }\n", | |
"\n", | |
" .colab-df-spinner {\n", | |
" border: 2px solid var(--fill-color);\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" animation:\n", | |
" spin 1s steps(1) infinite;\n", | |
" }\n", | |
"\n", | |
" @keyframes spin {\n", | |
" 0% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" border-left-color: var(--fill-color);\n", | |
" }\n", | |
" 20% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 30% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 40% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 60% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 80% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" 90% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" }\n", | |
"</style>\n", | |
"\n", | |
" <script>\n", | |
" async function quickchart(key) {\n", | |
" const quickchartButtonEl =\n", | |
" document.querySelector('#' + key + ' button');\n", | |
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", | |
" quickchartButtonEl.classList.add('colab-df-spinner');\n", | |
" try {\n", | |
" const charts = await google.colab.kernel.invokeFunction(\n", | |
" 'suggestCharts', [key], {});\n", | |
" } catch (error) {\n", | |
" console.error('Error during call to suggestCharts:', error);\n", | |
" }\n", | |
" quickchartButtonEl.classList.remove('colab-df-spinner');\n", | |
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", | |
" }\n", | |
" (() => {\n", | |
" let quickchartButtonEl =\n", | |
" document.querySelector('#df-7f93d536-0713-47f8-a2bd-a22ba0017957 button');\n", | |
" quickchartButtonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
" })();\n", | |
" </script>\n", | |
"</div>\n", | |
" </div>\n", | |
" </div>\n" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "dataframe", | |
"variable_name": "train_data", | |
"summary": "{\n \"name\": \"train_data\",\n \"rows\": 27481,\n \"fields\": [\n {\n \"column\": \"textID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27481,\n \"samples\": [\n \"a7f72a928a\",\n \"ef42dee96c\",\n \"07d17131b1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27480,\n \"samples\": [\n \" Enjoy! Family trumps everything\",\n \" --of them kinda turns me off of it all. And then I buy more of them and dig a deeper hole, etc. ;;\",\n \"Clive it`s my birthday pat me http://apps.facebook.com/dogbook/profile/view/6386106\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"selected_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 22463,\n \"samples\": [\n \"we win\",\n \"YES!!! haahaaa.! break out the jellybeaniesss!\",\n \"hay wats ur AIM? we should chat\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 23 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Check basic information\n", | |
"train_data.describe()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:18.849022Z", | |
"iopub.execute_input": "2023-12-09T03:06:18.849502Z", | |
"iopub.status.idle": "2023-12-09T03:06:18.943085Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:18.849465Z", | |
"shell.execute_reply": "2023-12-09T03:06:18.941595Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 175 | |
}, | |
"id": "LGQvMS65mh6Y", | |
"outputId": "e2ef44af-d069-4bb1-99f2-563bc35ec443" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" textID text selected_text \\\n", | |
"count 27481 27480 27480 \n", | |
"unique 27481 27480 22463 \n", | |
"top cb774db0d1 I`d have responded, if I were going good \n", | |
"freq 1 1 199 \n", | |
"\n", | |
" sentiment \n", | |
"count 27481 \n", | |
"unique 3 \n", | |
"top neutral \n", | |
"freq 11118 " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-ad0427ca-cb16-43e0-82bb-b17a3e70e007\" class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>textID</th>\n", | |
" <th>text</th>\n", | |
" <th>selected_text</th>\n", | |
" <th>sentiment</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>27481</td>\n", | |
" <td>27480</td>\n", | |
" <td>27480</td>\n", | |
" <td>27481</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>27481</td>\n", | |
" <td>27480</td>\n", | |
" <td>22463</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>cb774db0d1</td>\n", | |
" <td>I`d have responded, if I were going</td>\n", | |
" <td>good</td>\n", | |
" <td>neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>199</td>\n", | |
" <td>11118</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <div class=\"colab-df-buttons\">\n", | |
"\n", | |
" <div class=\"colab-df-container\">\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ad0427ca-cb16-43e0-82bb-b17a3e70e007')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
"\n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" .colab-df-buttons div {\n", | |
" margin-bottom: 4px;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-ad0427ca-cb16-43e0-82bb-b17a3e70e007 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-ad0427ca-cb16-43e0-82bb-b17a3e70e007');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
"\n", | |
"<div id=\"df-eb23c673-c67d-4a2b-ab0e-fae860d855eb\">\n", | |
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-eb23c673-c67d-4a2b-ab0e-fae860d855eb')\"\n", | |
" title=\"Suggest charts\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <g>\n", | |
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", | |
" </g>\n", | |
"</svg>\n", | |
" </button>\n", | |
"\n", | |
"<style>\n", | |
" .colab-df-quickchart {\n", | |
" --bg-color: #E8F0FE;\n", | |
" --fill-color: #1967D2;\n", | |
" --hover-bg-color: #E2EBFA;\n", | |
" --hover-fill-color: #174EA6;\n", | |
" --disabled-fill-color: #AAA;\n", | |
" --disabled-bg-color: #DDD;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-quickchart {\n", | |
" --bg-color: #3B4455;\n", | |
" --fill-color: #D2E3FC;\n", | |
" --hover-bg-color: #434B5C;\n", | |
" --hover-fill-color: #FFFFFF;\n", | |
" --disabled-bg-color: #3B4455;\n", | |
" --disabled-fill-color: #666;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart {\n", | |
" background-color: var(--bg-color);\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: var(--fill-color);\n", | |
" height: 32px;\n", | |
" padding: 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart:hover {\n", | |
" background-color: var(--hover-bg-color);\n", | |
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: var(--button-hover-fill-color);\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart-complete:disabled,\n", | |
" .colab-df-quickchart-complete:disabled:hover {\n", | |
" background-color: var(--disabled-bg-color);\n", | |
" fill: var(--disabled-fill-color);\n", | |
" box-shadow: none;\n", | |
" }\n", | |
"\n", | |
" .colab-df-spinner {\n", | |
" border: 2px solid var(--fill-color);\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" animation:\n", | |
" spin 1s steps(1) infinite;\n", | |
" }\n", | |
"\n", | |
" @keyframes spin {\n", | |
" 0% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" border-left-color: var(--fill-color);\n", | |
" }\n", | |
" 20% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 30% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 40% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 60% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 80% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" 90% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" }\n", | |
"</style>\n", | |
"\n", | |
" <script>\n", | |
" async function quickchart(key) {\n", | |
" const quickchartButtonEl =\n", | |
" document.querySelector('#' + key + ' button');\n", | |
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", | |
" quickchartButtonEl.classList.add('colab-df-spinner');\n", | |
" try {\n", | |
" const charts = await google.colab.kernel.invokeFunction(\n", | |
" 'suggestCharts', [key], {});\n", | |
" } catch (error) {\n", | |
" console.error('Error during call to suggestCharts:', error);\n", | |
" }\n", | |
" quickchartButtonEl.classList.remove('colab-df-spinner');\n", | |
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", | |
" }\n", | |
" (() => {\n", | |
" let quickchartButtonEl =\n", | |
" document.querySelector('#df-eb23c673-c67d-4a2b-ab0e-fae860d855eb button');\n", | |
" quickchartButtonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
" })();\n", | |
" </script>\n", | |
"</div>\n", | |
" </div>\n", | |
" </div>\n" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "dataframe", | |
"summary": "{\n \"name\": \"train_data\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"textID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"27481\",\n \"cb774db0d1\",\n \"1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"27480\",\n \" I`d have responded, if I were going\",\n \"1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"selected_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n 22463,\n \"199\",\n \"27480\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n 3,\n \"11118\",\n \"27481\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 24 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"test_data.describe()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:22.059012Z", | |
"iopub.execute_input": "2023-12-09T03:06:22.059505Z", | |
"iopub.status.idle": "2023-12-09T03:06:22.087533Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:22.059474Z", | |
"shell.execute_reply": "2023-12-09T03:06:22.086136Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 175 | |
}, | |
"id": "ziO__MDtmh6Z", | |
"outputId": "d4ed2938-e7de-4902-cf06-810dbb86a567" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" textID text \\\n", | |
"count 3534 3534 \n", | |
"unique 3534 3534 \n", | |
"top f87dea47db Last session of the day http://twitpic.com/67ezh \n", | |
"freq 1 1 \n", | |
"\n", | |
" sentiment \n", | |
"count 3534 \n", | |
"unique 3 \n", | |
"top neutral \n", | |
"freq 1430 " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-f2470ee3-a39b-4d22-8486-884b850c3405\" class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>textID</th>\n", | |
" <th>text</th>\n", | |
" <th>sentiment</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>3534</td>\n", | |
" <td>3534</td>\n", | |
" <td>3534</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>3534</td>\n", | |
" <td>3534</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>f87dea47db</td>\n", | |
" <td>Last session of the day http://twitpic.com/67ezh</td>\n", | |
" <td>neutral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1430</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <div class=\"colab-df-buttons\">\n", | |
"\n", | |
" <div class=\"colab-df-container\">\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f2470ee3-a39b-4d22-8486-884b850c3405')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
"\n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" .colab-df-buttons div {\n", | |
" margin-bottom: 4px;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-f2470ee3-a39b-4d22-8486-884b850c3405 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-f2470ee3-a39b-4d22-8486-884b850c3405');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
"\n", | |
"<div id=\"df-27dac3cc-b7f4-4396-acdb-6ee946af87e7\">\n", | |
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-27dac3cc-b7f4-4396-acdb-6ee946af87e7')\"\n", | |
" title=\"Suggest charts\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <g>\n", | |
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", | |
" </g>\n", | |
"</svg>\n", | |
" </button>\n", | |
"\n", | |
"<style>\n", | |
" .colab-df-quickchart {\n", | |
" --bg-color: #E8F0FE;\n", | |
" --fill-color: #1967D2;\n", | |
" --hover-bg-color: #E2EBFA;\n", | |
" --hover-fill-color: #174EA6;\n", | |
" --disabled-fill-color: #AAA;\n", | |
" --disabled-bg-color: #DDD;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-quickchart {\n", | |
" --bg-color: #3B4455;\n", | |
" --fill-color: #D2E3FC;\n", | |
" --hover-bg-color: #434B5C;\n", | |
" --hover-fill-color: #FFFFFF;\n", | |
" --disabled-bg-color: #3B4455;\n", | |
" --disabled-fill-color: #666;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart {\n", | |
" background-color: var(--bg-color);\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: var(--fill-color);\n", | |
" height: 32px;\n", | |
" padding: 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart:hover {\n", | |
" background-color: var(--hover-bg-color);\n", | |
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: var(--button-hover-fill-color);\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart-complete:disabled,\n", | |
" .colab-df-quickchart-complete:disabled:hover {\n", | |
" background-color: var(--disabled-bg-color);\n", | |
" fill: var(--disabled-fill-color);\n", | |
" box-shadow: none;\n", | |
" }\n", | |
"\n", | |
" .colab-df-spinner {\n", | |
" border: 2px solid var(--fill-color);\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" animation:\n", | |
" spin 1s steps(1) infinite;\n", | |
" }\n", | |
"\n", | |
" @keyframes spin {\n", | |
" 0% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" border-left-color: var(--fill-color);\n", | |
" }\n", | |
" 20% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 30% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 40% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 60% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 80% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" 90% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" }\n", | |
"</style>\n", | |
"\n", | |
" <script>\n", | |
" async function quickchart(key) {\n", | |
" const quickchartButtonEl =\n", | |
" document.querySelector('#' + key + ' button');\n", | |
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", | |
" quickchartButtonEl.classList.add('colab-df-spinner');\n", | |
" try {\n", | |
" const charts = await google.colab.kernel.invokeFunction(\n", | |
" 'suggestCharts', [key], {});\n", | |
" } catch (error) {\n", | |
" console.error('Error during call to suggestCharts:', error);\n", | |
" }\n", | |
" quickchartButtonEl.classList.remove('colab-df-spinner');\n", | |
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", | |
" }\n", | |
" (() => {\n", | |
" let quickchartButtonEl =\n", | |
" document.querySelector('#df-27dac3cc-b7f4-4396-acdb-6ee946af87e7 button');\n", | |
" quickchartButtonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
" })();\n", | |
" </script>\n", | |
"</div>\n", | |
" </div>\n", | |
" </div>\n" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "dataframe", | |
"summary": "{\n \"name\": \"test_data\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"textID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"3534\",\n \"f87dea47db\",\n \"1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"3534\",\n \"Last session of the day http://twitpic.com/67ezh\",\n \"1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n 3,\n \"1430\",\n \"3534\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 25 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Check for null values of train data\n", | |
"train_data.isna().sum()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:02.642708Z", | |
"iopub.execute_input": "2023-12-09T03:06:02.643173Z", | |
"iopub.status.idle": "2023-12-09T03:06:02.666775Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:02.643140Z", | |
"shell.execute_reply": "2023-12-09T03:06:02.665157Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "sW_rLEvTmh6Z", | |
"outputId": "f5974af5-e699-4372-d4de-32d330e96737" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"textID 0\n", | |
"text 1\n", | |
"selected_text 1\n", | |
"sentiment 0\n", | |
"dtype: int64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Drop null values\n", | |
"train_data.dropna(inplace=True)" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:34.309793Z", | |
"iopub.execute_input": "2023-12-09T03:06:34.310425Z", | |
"iopub.status.idle": "2023-12-09T03:06:34.336322Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:34.310375Z", | |
"shell.execute_reply": "2023-12-09T03:06:34.334742Z" | |
}, | |
"trusted": true, | |
"id": "BYth5ceQmh6Z" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Check for null values of test data\n", | |
"test_data.isna().sum()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:38.689673Z", | |
"iopub.execute_input": "2023-12-09T03:06:38.690104Z", | |
"iopub.status.idle": "2023-12-09T03:06:38.701870Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:38.690071Z", | |
"shell.execute_reply": "2023-12-09T03:06:38.700699Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Gad-bU3qmh6a", | |
"outputId": "b91e3e03-8c82-4993-9f35-4c98313a07eb" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"textID 0\n", | |
"text 0\n", | |
"sentiment 0\n", | |
"dtype: int64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 28 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Drop null values\n", | |
"test_data.dropna(inplace=True)" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:50.905069Z", | |
"iopub.execute_input": "2023-12-09T03:06:50.906112Z", | |
"iopub.status.idle": "2023-12-09T03:06:50.913867Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:50.906067Z", | |
"shell.execute_reply": "2023-12-09T03:06:50.912656Z" | |
}, | |
"trusted": true, | |
"id": "yD8fZWvnmh6a" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Count seniment values\n", | |
"train_data['sentiment'].value_counts()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:14:49.588524Z", | |
"iopub.execute_input": "2023-12-09T03:14:49.589545Z", | |
"iopub.status.idle": "2023-12-09T03:14:49.604619Z", | |
"shell.execute_reply.started": "2023-12-09T03:14:49.589491Z", | |
"shell.execute_reply": "2023-12-09T03:14:49.603145Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "tIX5rua1mh6b", | |
"outputId": "d7611ee3-cd43-4e04-d405-7dbf3a8b6fcf" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"neutral 11117\n", | |
"positive 8582\n", | |
"negative 7781\n", | |
"Name: sentiment, dtype: int64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 30 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Analysis text length\n", | |
"train_data['text_length'] = train_data['text'].apply(len)\n", | |
"train_data.groupby('sentiment')['text_length'].mean()" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:06:53.491565Z", | |
"iopub.execute_input": "2023-12-09T03:06:53.492804Z", | |
"iopub.status.idle": "2023-12-09T03:06:53.531364Z", | |
"shell.execute_reply.started": "2023-12-09T03:06:53.492759Z", | |
"shell.execute_reply": "2023-12-09T03:06:53.530181Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "gvJ2x3Rtmh6a", | |
"outputId": "3aff7c0a-44fe-4ca8-80c3-24d8cfd6867a" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"sentiment\n", | |
"negative 70.488112\n", | |
"neutral 65.206800\n", | |
"positive 70.419133\n", | |
"Name: text_length, dtype: float64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 31 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Analysis selected text length\n", | |
"train_data['selected_text_length'] = train_data['selected_text'].apply(len)\n", | |
"train_data.groupby('sentiment')['selected_text_length'].mean()\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YMMpNaeUnRag", | |
"outputId": "a551d4d5-ec7a-42a6-9814-63f4a4b08567" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"sentiment\n", | |
"negative 19.970698\n", | |
"neutral 62.765134\n", | |
"positive 18.124680\n", | |
"Name: selected_text_length, dtype: float64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 32 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# ANOVA results analysis\n", | |
"from scipy import stats\n", | |
"\n", | |
"f_val, p_val = stats.f_oneway(train_data[train_data['sentiment'] == 'positive']['text_length'],\n", | |
" train_data[train_data['sentiment'] == 'negative']['text_length'],\n", | |
" train_data[train_data['sentiment'] == 'neutral']['text_length'])\n", | |
"\n", | |
"print(\"ANOVA Test Results:\")\n", | |
"print(f\"F-statistic: {f_val}\")\n", | |
"print(f\"P-value: {p_val}\")\n", | |
"\n", | |
"# Interpret the results\n", | |
"alpha = 0.05\n", | |
"if p_val<alpha:\n", | |
" print(\"The means of at least two groups are significantly different.\")\n", | |
"else:\n", | |
" print(\"There is no significant difference in the means of the groups.\")" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:17:23.632685Z", | |
"iopub.execute_input": "2023-12-09T03:17:23.633161Z", | |
"iopub.status.idle": "2023-12-09T03:18:41.566408Z", | |
"shell.execute_reply.started": "2023-12-09T03:17:23.633128Z", | |
"shell.execute_reply": "2023-12-09T03:18:41.564637Z" | |
}, | |
"trusted": true, | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "8u8vnTYrmh6b", | |
"outputId": "59af0801-38f3-4f6f-e7d6-3e6ef11982bc" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"ANOVA Test Results:\n", | |
"F-statistic: 72.2127709711816\n", | |
"P-value: 5.254438748898152e-32\n", | |
"The means of at least two groups are significantly different.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Data Processing" | |
], | |
"metadata": { | |
"id": "j5gAPl2Umh6b" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import re\n", | |
"import string\n", | |
"\n", | |
"def clean_text(text):\n", | |
" text = text.lower()\n", | |
" text = re.sub(r\"what's\", \"what is \", text)\n", | |
" text = re.sub(r\"\\'s\", \" \", text)\n", | |
" text = re.sub(r\"\\'ve\", \" have \", text)\n", | |
" text = re.sub(r\"can't\", \"cannot \", text)\n", | |
" text = re.sub(r\"n't\", \" not \", text)\n", | |
" return text\n", | |
"\n", | |
"train_data['text'] = train_data['text'].apply(clean_text)\n", | |
"test_data['text'] = test_data['text'].apply(clean_text)" | |
], | |
"metadata": { | |
"id": "BphYjewqpCaL" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from tensorflow.keras.preprocessing.text import Tokenizer\n", | |
"\n", | |
"# Tokenize the text\n", | |
"token = Tokenizer(num_words = 500)\n", | |
"token.fit_on_texts(train_data['text'])\n", | |
"\n", | |
"# Convert texts to sequence of integers\n", | |
"train_sequences = token.texts_to_sequences(train_data['text'])\n", | |
"test_sequences = token.texts_to_sequences(test_data['text'])\n", | |
"\n", | |
"# Convert labels to categorical one-hot encoding\n", | |
"train_labels = pd.get_dummies(train_data['sentiment']).values\n", | |
"test_labels = pd.get_dummies(test_data['sentiment']).values\n" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-09T03:21:59.215359Z", | |
"iopub.execute_input": "2023-12-09T03:21:59.215864Z", | |
"iopub.status.idle": "2023-12-09T03:22:00.003619Z", | |
"shell.execute_reply.started": "2023-12-09T03:21:59.215829Z", | |
"shell.execute_reply": "2023-12-09T03:22:00.001594Z" | |
}, | |
"trusted": true, | |
"id": "1iM34W3Kmh6b" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Vectorize the text\n", | |
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n", | |
"\n", | |
"max_len = max([len(x) for x in train_sequences])\n", | |
"train_padded = pad_sequences(train_sequences, maxlen = max_len, padding = \"post\", truncating = \"post\")\n", | |
"test_padded = pad_sequences(test_sequences, maxlen = max_len, padding = \"post\", truncating = \"post\")\n", | |
"\n", | |
"print(max_len)\n", | |
"print(train_padded.shape)\n", | |
"print(test_padded.shape)\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "46S4ueo3mh6b", | |
"outputId": "7a345779-d3ea-4656-fdd4-65d819a0ffa2" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"32\n", | |
"(27480, 32)\n", | |
"(3534, 32)\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Build the model" | |
], | |
"metadata": { | |
"id": "_B7z4_rmsNbJ" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from tensorflow.keras.models import Sequential\n", | |
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n", | |
"from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout\n", | |
"from tensorflow.keras.callbacks import EarlyStopping\n", | |
"from tensorflow.keras.preprocessing.text import Tokenizer\n", | |
"from tensorflow.keras.utils import to_categorical\n", | |
"import numpy as np\n", | |
"import tensorflow as tf" | |
], | |
"metadata": { | |
"execution": { | |
"iopub.status.busy": "2023-12-08T08:48:28.404499Z", | |
"iopub.execute_input": "2023-12-08T08:48:28.404989Z", | |
"iopub.status.idle": "2023-12-08T08:48:28.756170Z", | |
"shell.execute_reply.started": "2023-12-08T08:48:28.404953Z", | |
"shell.execute_reply": "2023-12-08T08:48:28.754753Z" | |
}, | |
"trusted": true, | |
"id": "vzFXiRI5mh6b" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"np.random.seed(42)\n", | |
"tf.random.set_seed(42)" | |
], | |
"metadata": { | |
"id": "rmY2dSgAI6wL" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def lstm_model(max_len):\n", | |
" model = Sequential()\n", | |
"\n", | |
" # Add embedding layer\n", | |
" model.add(Embedding(input_dim = 500, output_dim = 32, input_length = max_len))\n", | |
"\n", | |
" # Add LSTM layer\n", | |
" model.add(LSTM(64, return_sequences=True))\n", | |
" model.add(Dropout(0.5))\n", | |
"\n", | |
" # Add LSTM layer without returning sequences\n", | |
" model.add(LSTM(32))\n", | |
" model.add(Dropout(0.5))\n", | |
"\n", | |
" # Add dense layer\n", | |
" model.add(Dense(3, activation='sigmoid'))\n", | |
"\n", | |
" return model\n", | |
"\n", | |
"def gru_model(max_len):\n", | |
" model = Sequential()\n", | |
"\n", | |
" # Add embedding layer\n", | |
" model.add(Embedding(input_dim = 500, output_dim = 32, input_length = max_len))\n", | |
" # Add GRU layer\n", | |
" model.add(GRU(128, return_sequences=True))\n", | |
" model.add(Dropout(0.5))\n", | |
"\n", | |
" # Add GRU layer without returning sequences\n", | |
" model.add(GRU(32))\n", | |
" model.add(Dropout(0.5))\n", | |
"\n", | |
" # Add dense layer\n", | |
" model.add(Dense(3, activation='sigmoid'))\n", | |
"\n", | |
" return model" | |
], | |
"metadata": { | |
"id": "h51X_LSwsSd1" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"lstm = lstm_model(max_len)\n", | |
"lstm.summary()\n", | |
"# Complie the models\n", | |
"lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", | |
"gru_model(max_len).compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Z2YzFGqu0iIz", | |
"outputId": "c3bd49c0-6aba-418d-aac3-8174818b90ca" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Model: \"sequential\"\n", | |
"_________________________________________________________________\n", | |
" Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
" embedding (Embedding) (None, 32, 32) 16000 \n", | |
" \n", | |
" lstm (LSTM) (None, 32, 64) 24832 \n", | |
" \n", | |
" dropout (Dropout) (None, 32, 64) 0 \n", | |
" \n", | |
" lstm_1 (LSTM) (None, 32) 12416 \n", | |
" \n", | |
" dropout_1 (Dropout) (None, 32) 0 \n", | |
" \n", | |
" dense (Dense) (None, 3) 99 \n", | |
" \n", | |
"=================================================================\n", | |
"Total params: 53347 (208.39 KB)\n", | |
"Trainable params: 53347 (208.39 KB)\n", | |
"Non-trainable params: 0 (0.00 Byte)\n", | |
"_________________________________________________________________\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"gru = gru_model(max_len)\n", | |
"gru.summary()\n", | |
"# Complie the models\n", | |
"gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "m5zd-TmeBb5p", | |
"outputId": "0907d4cb-5675-4cbd-e6ea-fc08bb9c6368" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Model: \"sequential_2\"\n", | |
"_________________________________________________________________\n", | |
" Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
" embedding_2 (Embedding) (None, 32, 32) 16000 \n", | |
" \n", | |
" gru_2 (GRU) (None, 32, 128) 62208 \n", | |
" \n", | |
" dropout_4 (Dropout) (None, 32, 128) 0 \n", | |
" \n", | |
" gru_3 (GRU) (None, 32) 15552 \n", | |
" \n", | |
" dropout_5 (Dropout) (None, 32) 0 \n", | |
" \n", | |
" dense_2 (Dense) (None, 3) 99 \n", | |
" \n", | |
"=================================================================\n", | |
"Total params: 93859 (366.64 KB)\n", | |
"Trainable params: 93859 (366.64 KB)\n", | |
"Non-trainable params: 0 (0.00 Byte)\n", | |
"_________________________________________________________________\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Train the models" | |
], | |
"metadata": { | |
"id": "mSijiLQZ0dk3" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Train the models with early stopping\n", | |
"early_stopping = EarlyStopping(patience=3, restore_best_weights=True)" | |
], | |
"metadata": { | |
"id": "w72HRLgx4e4Q" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"lstm_history = lstm.fit(train_padded, train_labels, epochs=10, validation_split=0.2)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "BxLCU34-4gtj", | |
"outputId": "d2ccbde3-5a44-446c-ced1-d1f180276c1d" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Epoch 1/10\n", | |
"687/687 [==============================] - 32s 37ms/step - loss: 1.0219 - accuracy: 0.4775 - val_loss: 0.8188 - val_accuracy: 0.6527\n", | |
"Epoch 2/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.8102 - accuracy: 0.6598 - val_loss: 0.7817 - val_accuracy: 0.6652\n", | |
"Epoch 3/10\n", | |
"687/687 [==============================] - 25s 37ms/step - loss: 0.7768 - accuracy: 0.6731 - val_loss: 0.7642 - val_accuracy: 0.6678\n", | |
"Epoch 4/10\n", | |
"687/687 [==============================] - 24s 35ms/step - loss: 0.7589 - accuracy: 0.6784 - val_loss: 0.7515 - val_accuracy: 0.6765\n", | |
"Epoch 5/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.7517 - accuracy: 0.6802 - val_loss: 0.7628 - val_accuracy: 0.6752\n", | |
"Epoch 6/10\n", | |
"687/687 [==============================] - 24s 35ms/step - loss: 0.7412 - accuracy: 0.6848 - val_loss: 0.7506 - val_accuracy: 0.6809\n", | |
"Epoch 7/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.7348 - accuracy: 0.6894 - val_loss: 0.7580 - val_accuracy: 0.6674\n", | |
"Epoch 8/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.7282 - accuracy: 0.6905 - val_loss: 0.7830 - val_accuracy: 0.6590\n", | |
"Epoch 9/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.7207 - accuracy: 0.6943 - val_loss: 0.7708 - val_accuracy: 0.6661\n", | |
"Epoch 10/10\n", | |
"687/687 [==============================] - 23s 34ms/step - loss: 0.7152 - accuracy: 0.6964 - val_loss: 0.7541 - val_accuracy: 0.6774\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"gru_history= gru.fit(train_padded, train_labels, epochs=10, validation_split=0.2, callbacks=[early_stopping])" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "u8js0eVyBut3", | |
"outputId": "bf75fabb-ff65-4b7d-d179-c655714fceb8" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Epoch 1/10\n", | |
"687/687 [==============================] - 40s 52ms/step - loss: 1.0896 - accuracy: 0.3977 - val_loss: 1.0874 - val_accuracy: 0.4127\n", | |
"Epoch 2/10\n", | |
"687/687 [==============================] - 38s 55ms/step - loss: 1.0894 - accuracy: 0.4025 - val_loss: 1.0858 - val_accuracy: 0.4127\n", | |
"Epoch 3/10\n", | |
"687/687 [==============================] - 35s 50ms/step - loss: 1.0880 - accuracy: 0.4026 - val_loss: 1.0870 - val_accuracy: 0.4127\n", | |
"Epoch 4/10\n", | |
"687/687 [==============================] - 38s 55ms/step - loss: 1.0884 - accuracy: 0.4028 - val_loss: 1.0857 - val_accuracy: 0.4127\n", | |
"Epoch 5/10\n", | |
"687/687 [==============================] - 35s 51ms/step - loss: 1.0883 - accuracy: 0.4025 - val_loss: 1.0855 - val_accuracy: 0.4127\n", | |
"Epoch 6/10\n", | |
"687/687 [==============================] - 37s 54ms/step - loss: 0.9598 - accuracy: 0.5236 - val_loss: 0.7822 - val_accuracy: 0.6658\n", | |
"Epoch 7/10\n", | |
"687/687 [==============================] - 35s 51ms/step - loss: 0.7834 - accuracy: 0.6618 - val_loss: 0.7622 - val_accuracy: 0.6738\n", | |
"Epoch 8/10\n", | |
"687/687 [==============================] - 37s 54ms/step - loss: 0.7639 - accuracy: 0.6724 - val_loss: 0.7650 - val_accuracy: 0.6721\n", | |
"Epoch 9/10\n", | |
"687/687 [==============================] - 35s 51ms/step - loss: 0.7415 - accuracy: 0.6831 - val_loss: 0.7536 - val_accuracy: 0.6734\n", | |
"Epoch 10/10\n", | |
"687/687 [==============================] - 37s 53ms/step - loss: 0.7278 - accuracy: 0.6897 - val_loss: 0.7486 - val_accuracy: 0.6772\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def jaccard(str1, str2):\n", | |
" a = set(str1.lower().split())\n", | |
" b = set(str2.lower().split())\n", | |
" c = a.intersection(b)\n", | |
" return float(len(c)) / (len(a) + len(b) - len(c))\n", | |
"\n", | |
"def evaluate_model(model, tokenizer, data, true_texts):\n", | |
" \"\"\"\n", | |
" Evaluate the model using the Jaccard score.\n", | |
" - model: The trained model (LSTM or GRU)\n", | |
" - tokenizer: Tokenizer used for the model\n", | |
" - data: The input data for prediction (features)\n", | |
" - true_texts: The true output texts (labels)\n", | |
"\n", | |
" Returns the average Jaccard score for the dataset.\n", | |
" \"\"\"\n", | |
" # Generate predictions\n", | |
" predictions = model.predict(data)\n", | |
"\n", | |
" pred_texts = [\" \".join(tokenizer.sequences_to_texts([p])) for p in predictions]\n", | |
"\n", | |
" # Compute Jaccard scores\n", | |
" scores = [jaccard(pred, true) for pred, true in zip(pred_texts, true_texts)]\n", | |
"\n", | |
" # Return the average Jaccard score\n", | |
" return sum(scores) / len(scores)" | |
], | |
"metadata": { | |
"id": "UGLCcoKzBuDm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Evaluate on test data\n", | |
"test_loss, test_acc = lstm.evaluate(test_padded, test_labels)\n", | |
"print(\"Test accuracy:\", test_acc)\n", | |
"\n", | |
"test_loss, test_acc = gru.evaluate(test_padded, test_labels)\n", | |
"print(\"Test accuracy:\", test_acc)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ZVbw5hb6C9Yg", | |
"outputId": "179c13e3-64bf-4117-8a44-c0e35978fa87" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"111/111 [==============================] - 2s 14ms/step - loss: 0.7518 - accuracy: 0.6672\n", | |
"Test accuracy: 0.6672325730323792\n", | |
"111/111 [==============================] - 2s 16ms/step - loss: 0.7480 - accuracy: 0.6692\n", | |
"Test accuracy: 0.6692133545875549\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Model accuracy\n", | |
"lstm_val_acc = max(lstm_history.history['val_accuracy'])\n", | |
"gru_val_acc = max(gru_history.history['val_accuracy'])\n", | |
"\n", | |
"print(\"LSTM validation accuracy:\", lstm_val_acc)\n", | |
"print(\"GRU validation accuracy:\", gru_val_acc)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "fTkR4aemFpXS", | |
"outputId": "bf7b8065-4a28-44fb-ffe3-f006b7dd9fdc" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"LSTM validation accuracy: 0.6808587908744812\n", | |
"GRU validation accuracy: 0.677219808101654\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Confusion matrix\n", | |
"from sklearn.metrics import classification_report, confusion_matrix\n", | |
"lstm_pred = lstm.predict(test_padded)\n", | |
"gru_pred = gru.predict(test_padded)\n", | |
"\n", | |
"print(classification_report(test_labels.argmax(axis=1), lstm_pred.argmax(axis=1)))\n", | |
"print(classification_report(test_labels.argmax(axis=1), gru_pred.argmax(axis=1)))" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Vu1OEfmuMd5S", | |
"outputId": "b866ec81-2d9f-440a-9cf5-3eadacc159bc" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"111/111 [==============================] - 2s 11ms/step\n", | |
"111/111 [==============================] - 2s 14ms/step\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.78 0.46 0.58 1001\n", | |
" 1 0.58 0.79 0.67 1430\n", | |
" 2 0.76 0.70 0.73 1103\n", | |
"\n", | |
" accuracy 0.67 3534\n", | |
" macro avg 0.71 0.65 0.66 3534\n", | |
"weighted avg 0.70 0.67 0.66 3534\n", | |
"\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.75 0.47 0.58 1001\n", | |
" 1 0.58 0.79 0.67 1430\n", | |
" 2 0.79 0.69 0.74 1103\n", | |
"\n", | |
" accuracy 0.67 3534\n", | |
" macro avg 0.71 0.65 0.66 3534\n", | |
"weighted avg 0.69 0.67 0.67 3534\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "25XNKkNYCQae" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from datetime import datetime\n", | |
"\n", | |
"def save_model(model, prefix =''):\n", | |
" # Get the current time and filename\n", | |
" current_time = datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n", | |
" filename = f\"{prefix}model_{current_time}.csv\"\n", | |
" # Save the models\n", | |
" model.save(filename)\n", | |
" print(f\"Model saved to {filename}\")\n", | |
"\n", | |
"save_model(lstm, prefix = \"lstm\")\n", | |
"save_model(gru, prefix = \"gru\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "LYqnTQa-bdqn", | |
"outputId": "e248dd33-fa4a-4c52-88c2-01d216316d18" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Model saved to lstmmodel_2024-03-04-015021.csv\n", | |
"Model saved to grumodel_2024-03-04-015029.csv\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install --upgrade kaggle" | |
], | |
"metadata": { | |
"id": "vgjLVo-l_LO9" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"! kaggle competitions submit -c tweet-sentiment-extraction -f '/content/lstmmodel_2024-03-04-005517.csv' -m late-submission" | |
], | |
"metadata": { | |
"id": "kmaX09fo9Urz", | |
"outputId": "24134994-354d-4d1b-8ac2-9e7ab86e3994", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Traceback (most recent call last):\n", | |
" File \"/usr/local/bin/kaggle\", line 8, in <module>\n", | |
" sys.exit(main())\n", | |
" File \"/usr/local/lib/python3.10/dist-packages/kaggle/cli.py\", line 70, in main\n", | |
" out = args.func(**command_args)\n", | |
" File \"/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py\", line 801, in competition_submit_cli\n", | |
" submit_result = self.competition_submit(file_name, message,\n", | |
" File \"/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py\", line 752, in competition_submit\n", | |
" content_length=os.path.getsize(file_name),\n", | |
" File \"/usr/lib/python3.10/genericpath.py\", line 50, in getsize\n", | |
" return os.stat(filename).st_size\n", | |
"FileNotFoundError: [Errno 2] No such file or directory: '/content/lstmmodel_2024-03-04-005517.csv'\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment