Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save sudip-mondal-2002/67c735d0e1cbd768b698e47dd1404f58 to your computer and use it in GitHub Desktop.
Save sudip-mondal-2002/67c735d0e1cbd768b698e47dd1404f58 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Copy of FakeNewsDetectionUsingLSTM.ipynb",
"provenance": [],
"collapsed_sections": [],
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yen7XGzttxem",
"outputId": "bf0607a3-1626-45d9-9b3e-6ddcc8a49923"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NRI7QElNX1ST"
},
"source": [
"# Downloading the dependencies\n",
"\n",
"- Downloading the dataset from Kaggle using the Kaggle API\n",
"- Downloading pretrained GloVe embeddings"
]
},
{
"cell_type": "code",
"metadata": {
"id": "xl6JPS-GuTqh"
},
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from IPython.display import clear_output\n",
"\n",
"%pip install -q kaggle\n",
"\n",
"# Credentials must never be stored in the notebook. Reuse KAGGLE_USERNAME /\n",
"# KAGGLE_KEY when they are already set in the environment; otherwise prompt\n",
"# for them without echoing the value.\n",
"# NOTE(review): the API key that used to be hard-coded in this cell has been\n",
"# published and should be revoked on kaggle.com.\n",
"for credential in (\"KAGGLE_USERNAME\", \"KAGGLE_KEY\"):\n",
"    if not os.environ.get(credential):\n",
"        os.environ[credential] = getpass(credential + \": \")\n",
"\n",
"!kaggle competitions download -c fake-news\n",
"# -o overwrites without prompting, so re-running the cell cannot block.\n",
"!unzip -o /content/train.csv.zip\n",
"!unzip -o /content/test.csv.zip\n",
"!rm *.zip\n",
"\n",
"clear_output()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "B6Xay_piI0GH"
},
"source": [
"# -nc / -p / -o keep this cell re-runnable: skip the download when the archive\n",
"# is already present, tolerate existing directories, overwrite without prompting.\n",
"!wget -nc -q https://nlp.stanford.edu/data/glove.6B.zip\n",
"!mkdir -p embeddings/glove.6B\n",
"!unzip -o -q /content/glove.6B.zip -d \"/content/embeddings/glove.6B/\"\n",
"\n",
"clear_output()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "2J2M_Ur-uXF3"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "e_AiF3bjYEeq"
},
"source": [
"# Importing Libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "tu6wb15KuU0U"
},
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import os\n",
"import gc\n",
"import sys\n",
"import pickle\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "45cu-piJugZE"
},
"source": [
"from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers.experimental.preprocessing import TextVectorization\n",
"import keras"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "FulQCI7EwhHN"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_ONcF1cgYLT3"
},
"source": [
"# Processing the Dataset\n",
"\n",
"Concatenating the title and the text of each article so that the model sees both when classifying it."
]
},
{
"cell_type": "code",
"metadata": {
"id": "PTb3Vj1U7ZGT"
},
"source": [
"train = pd.read_csv('/content/train.csv')\n",
"test = pd.read_csv('/content/test.csv')\n",
"\n",
"# Rows with any missing field are discarded.\n",
"train = train.dropna()\n",
"test = test.dropna()\n",
"\n",
"# Join title and body with a space; plain `+` fuses the last word of the title\n",
"# with the first word of the article (\"...FiredWhy...\"), creating bogus tokens.\n",
"train['text'] = train['title'] + ' ' + train['text']\n",
"test['text'] = test['title'] + ' ' + test['text']\n",
"\n",
"# Only the merged text (and, for train, the label) is used from here on.\n",
"train = train.drop(columns=['id', 'title', 'author'])\n",
"test = test.drop(columns=['id', 'title', 'author'])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "lm5myaGUYXwE"
},
"source": [
"Visualizing the dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "-Nb-2U2aPU1p",
"outputId": "bc5be4ed-c863-4107-e2a0-5802348c0bc8"
},
"source": [
"train.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>FLYNN: Hillary Clinton, Big Woman on Campus - ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Why the Truth Might Get You FiredWhy the Truth...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>15 Civilians Killed In Single US Airstrike Hav...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Iranian woman jailed for fictional unpublished...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 House Dem Aide: We Didn’t Even See Comey’s Let... 1\n",
"1 FLYNN: Hillary Clinton, Big Woman on Campus - ... 0\n",
"2 Why the Truth Might Get You FiredWhy the Truth... 1\n",
"3 15 Civilians Killed In Single US Airstrike Hav... 1\n",
"4 Iranian woman jailed for fictional unpublished... 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OucLYeCpYbAX"
},
"source": [
"Converting the text and label columns into NumPy arrays"
]
},
{
"cell_type": "code",
"metadata": {
"id": "TEApOLWXPtph"
},
"source": [
"x_train = train['text'].to_numpy()\n",
"y_train = train['label'].to_numpy()\n",
"x_test = test['text'].to_numpy()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "9-dzvXYma4D5"
},
"source": [
"## Visualizing the data"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 296
},
"id": "g5BdAuSSa3tL",
"outputId": "74df59a5-9b46-4578-dfcd-9bde0dda42aa"
},
"source": [
"sns.countplot(x='label', data=train)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7eff1bc3e590>"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
},
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZEAAAEGCAYAAACkQqisAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQcUlEQVR4nO3df+xddX3H8edLCqJzQpGGsRbXbjY6dDNCA6iJmbJAYZtlBg1OR8eadYlsumW/cH+sC8qimY6pmySNVIojIqIbbHOSpv6KiyCtMPlRCd+g2DZgv9KCv4Jafe+P+/nqFb/Fy6f93tsv3+cjufme8/58zrnvkzR95Zx77rmpKiRJ6vGUSTcgSZq/DBFJUjdDRJLUzRCRJHUzRCRJ3RZNuoFxO/7442v58uWTbkOS5o3t27d/vaqWzDa24EJk+fLlbNu2bdJtSNK8keT+A415OUuS1M0QkSR1M0QkSd0MEUlSN0NEktTNEJEkdTNEJEndDBFJUjdDRJLUbcF9Y/1gnfpXV0+6BR2Gtv/jhZNuQZoIz0QkSd0MEUlSN0NEktRtzkIkyaYke5LcOVQ7LsmWJPe2v4tbPUnenWQqyReTnDK0zdo2/94ka4fqpya5o23z7iSZq2ORJM1uLs9ErgJWP6Z2CbC1qlYCW9s6wDnAyvZaD1wBg9ABNgCnA6cBG2aCp835o6HtHvtekqQ5NmchUlWfAfY+prwG2NyWNwPnDdWvroGbgWOTnAicDWypqr1VtQ/YAqxuY8+sqpurqoCrh/YlSRqTcX8mckJVPdCWHwROaMtLgZ1D83a12uPVd81Sn1WS9Um2Jdk2PT19cEcgSfqRiX2w3s4gakzvtbGqVlXVqiVLZv2FR0lSh3GHyNfapSja3z2tvhs4aWjeslZ7vPqyWeqSpDEad4jcCMzcYbUWuGGofmG7S+sM4JF22esm4Kwki9sH6mcBN7WxbyQ5o92VdeHQviRJYzJnjz1J8kHgN4Djk+xicJfV24DrkqwD7gde06Z/DDgXmAK+A1wEUFV7k7wFuLXNu7SqZj6sfwODO8CeBvxPe0mSxmjOQqSqXnuAoTNnmVvAxQfYzyZg0yz1bcALDqZHSdLB8RvrkqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6maISJK6GSKSpG6GiCSpmyEiSepmiEiSuhkikqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6maISJK6GSKSpG6GiCSpmyEiSepmiEiSuhkikqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6jaREEny50nuSnJnkg8mOTrJiiS3JJlK8qEkR7W5T23rU218+dB+3tzq9yQ5exLHIkkL2dhDJMlS4I3Aqqp6AXAEcAHwduDyqnoOsA9Y1zZZB+xr9cvbPJKc3LZ7PrAaeG+SI8Z5LJK00E3qctYi4GlJFgFPBx4AXgFc38Y3A+e15TVtnTZ+ZpK0+rVV9d2q+jIwBZw2pv4lSUwgRKpqN/AO4KsMwuMRYDvwcFXtb9N2AUvb8lJgZ9t2f5v/rOH6LNv8hCTrk2xLsm16evrQHpAkLWCLxv2GSRYzOItYATwMfJjB5ag5U1UbgY0Aq1atqrl8L2mSvnrpr026BR2Gnv13d8zZvidxOes3gS9X1XRVfR/4KPBS4Nh2eQtgGbC7Le8GTgJo48cADw3XZ9lGkjQGkwiRrwJnJHl6+2zjTOBu4JPA+W3OWuCGtnxjW6eNf6KqqtUvaHdvrQBWAp8f0zFIkpjA5ayquiXJ9cAXgP3AbQwuNf03cG2St7balW2TK4EPJJkC9jK4I4uquivJdQwCaD9wcVX9YKwHI0kL3NhDBKCqNgAbHlO+j1nurqqqR4FXH2A/lwGXHfIGJUkj8RvrkqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6maISJK6GSKSpG6GiCSpmy
EiSepmiEiSuhkikqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6maISJK6GSKSpG6GiCSpmyEiSepmiEiSuhkikqRuhogkqZshIknqZohIkroZIpKkboaIJKmbISJJ6jaREElybJLrk3wpyY4kL05yXJItSe5tfxe3uUny7iRTSb6Y5JSh/axt8+9NsnYSxyJJC9mkzkTeBXy8qp4HvBDYAVwCbK2qlcDWtg5wDrCyvdYDVwAkOQ7YAJwOnAZsmAkeSdJ4jD1EkhwDvAy4EqCqvldVDwNrgM1t2mbgvLa8Bri6Bm4Gjk1yInA2sKWq9lbVPmALsHqMhyJJC95IIZJk6yi1Ea0ApoH3J7ktyfuS/BxwQlU90OY8CJzQlpcCO4e239VqB6rP1v/6JNuSbJuenu5sW5L0WI8bIkmObpeNjk+yuH1ucVyS5RzgP+wRLAJOAa6oqhcB3+bHl64AqKoCqnP/P6WqNlbVqqpatWTJkkO1W0la8H7WmcgfA9uB57W/M68bgH/pfM9dwK6quqWtX88gVL7WLlPR/u5p47uBk4a2X9ZqB6pLksbkcUOkqt5VVSuAv6yqX66qFe31wqrqCpGqehDYmeS5rXQmcDdwIzBzh9VaBkFFq1/Y7tI6A3ikXfa6CTirnSEtBs5qNUnSmCwaZVJVvSfJS4Dlw9tU1dWd7/unwDVJjgLuAy5iEGjXJVkH3A+8ps39GHAuMAV8p82lqvYmeQtwa5t3aVXt7exHktRhpBBJ8gHgV4DbgR+0cgFdIVJVtwOrZhk6c5a5BVx8gP1sAjb19CBJOngjhQiD//BPbv+hS5IEjP49kTuBX5jLRiRJ88+oZyLHA3cn+Tzw3ZliVb1yTrqSJM0Lo4bI389lE5Kk+WnUu7M+PdeNSJLmn1HvzvomP/4G+VHAkcC3q+qZc9WYJOnwN+qZyM/PLCcJg4cinjFXTUmS5ocn/BTf9jTd/2DwFF1J0gI26uWsVw2tPoXB90YenZOOJEnzxqh3Z/3O0PJ+4CsMLmlJkhawUT8TuWiuG5EkzT+j/ijVsiT/nmRPe30kybK5bk6SdHgb9YP19zN4JPsvttd/tpokaQEbNUSWVNX7q2p/e10F+BOBkrTAjRoiDyV5fZIj2uv1wENz2Zgk6fA3aoj8IYMfiXoQeAA4H/iDOepJkjRPjHqL76XA2qraB5DkOOAdDMJFkrRAjXom8uszAQKDn6YFXjQ3LUmS5otRQ+QpSRbPrLQzkVHPYiRJT1KjBsE7gc8l+XBbfzVw2dy0JEmaL0b9xvrVSbYBr2ilV1XV3XPXliRpPhj5klQLDYNDkvQjT/hR8JIkzTBEJEndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1M0QkSR1m1iItB+3ui3Jf7X1FUluSTKV5ENJjmr1p7b1qTa+fGgfb271e5KcPZkjkaSFa5JnIm8Cdgytvx24vKqeA+wD1rX6OmBfq1/e5pHkZOAC4PnAauC9SY4YU++SJCYUIkmWAb8FvK+th8HDHa9vUzYD57XlNW2dNn5mm78GuLaqvltVXwamgNPGcwSSJJjcmcg/A38N/LCtPwt4uKr2t/VdwNK2vBTYCdDGH2nzf1SfZZufkGR9km1Jtk1PTx/K45CkBW3sIZLkt4E9VbV9XO9ZVRuralVVrVqyZMm43laSnvQm8euELwVemeRc4GjgmcC7gGOTLGpnG8uA3W3+buAkYFeSRcAxwEND9RnD20iSxmDsZyJV9eaqWlZVyxl8MP6Jqnod8Eng/DZtLXBDW76xrdPGP1FV1eoXtLu3VgArgc+P6TAkSRxev5P+N8C1Sd4K3AZc2epXAh9IMgXsZRA8VNVdSa5j8ENZ+4GLq+oH429bkhauiYZIVX0K+FRbvo9Z7q6qqkcZ/Kb7bNtfhr/1LkkT4zfWJUndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1M0QkSR1M0QkSd0MEUlSN0NEktTNEJ
EkdTNEJEndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1M0QkSR1M0QkSd0MEUlSN0NEktTNEJEkdTNEJEndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1G3sIZLkpCSfTHJ3kruSvKnVj0uyJcm97e/iVk+SdyeZSvLFJKcM7Wttm39vkrXjPhZJWugmcSayH/iLqjoZOAO4OMnJwCXA1qpaCWxt6wDnACvbaz1wBQxCB9gAnA6cBmyYCR5J0niMPUSq6oGq+kJb/iawA1gKrAE2t2mbgfPa8hrg6hq4GTg2yYnA2cCWqtpbVfuALcDqMR6KJC14E/1MJMly4EXALcAJVfVAG3oQOKEtLwV2Dm22q9UOVJ/tfdYn2ZZk2/T09CHrX5IWuomFSJJnAB8B/qyqvjE8VlUF1KF6r6raWFWrqmrVkiVLDtVuJWnBm0iIJDmSQYBcU1UfbeWvtctUtL97Wn03cNLQ5sta7UB1SdKYTOLurABXAjuq6p+Ghm4EZu6wWgvcMFS/sN2ldQbwSLvsdRNwVpLF7QP1s1pNkjQmiybwni8Ffh+4I8ntrfa3wNuA65KsA+4HXtPGPgacC0wB3wEuAqiqvUneAtza5l1aVXvHcwiSJJhAiFTVZ4EcYPjMWeYXcPEB9rUJ2HToupMkPRF+Y12S1M0QkSR1M0QkSd0MEUlSN0NEktTNEJEkdTNEJEndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1M0QkSR1M0QkSd0MEUlSN0NEktTNEJEkdTNEJEndDBFJUjdDRJLUzRCRJHUzRCRJ3QwRSVI3Q0SS1M0QkSR1M0QkSd0MEUlSN0NEktTNEJEkdTNEJEnd5n2IJFmd5J4kU0kumXQ/krSQzOsQSXIE8K/AOcDJwGuTnDzZriRp4ZjXIQKcBkxV1X1V9T3gWmDNhHuSpAVj0aQbOEhLgZ1D67uA0x87Kcl6YH1b/VaSe8bQ20JwPPD1STdxOMg71k66Bf00/33O2JCD3cMvHWhgvofISKpqI7Bx0n082STZVlWrJt2HNBv/fY7HfL+ctRs4aWh9WatJksZgvofIrcDKJCuSHAVcANw44Z4kacGY15ezqmp/kj8BbgKOADZV1V0Tbmsh8RKhDmf++xyDVNWke5AkzVPz/XKWJGmCDBFJUjdDRF183IwOV0k2JdmT5M5J97IQGCJ6wnzcjA5zVwGrJ93EQmGIqIePm9Fhq6o+A+yddB8LhSGiHrM9bmbphHqRNEGGiCSpmyGiHj5uRhJgiKiPj5uRBBgi6lBV+4GZx83sAK7zcTM6XCT5IPA54LlJdiVZN+mensx87IkkqZtnIpKkboaIJKmbISJJ6maISJK6GSKSpG6GiDSHknzrZ4wvf6JPm01yVZLzD64z6dAwRCRJ3QwRaQySPCPJ1iRfSHJHkuGnHi9Kck2SHUmuT/L0ts2pST6dZHuSm5KcOKH2pQMyRKTxeBT43ao6BXg58M4kaWPPBd5bVb8KfAN4Q5IjgfcA51fVqcAm4LIJ9C09rkWTbkBaIAL8Q5KXAT9k8Oj8E9rYzqr637b8b8AbgY8DLwC2tKw5AnhgrB1LIzBEpPF4HbAEOLWqvp/kK8DRbeyxzx4qBqFzV1W9eHwtSk+cl7Ok8TgG2NMC5OXALw2NPTvJTFj8HvBZ4B5gyUw9yZFJnj/WjqURGCLSeFwDrEpyB3Ah8KWhsXuAi5PsABYDV7SfHT4feHuS/wNuB14y5p6ln8mn+EqSunkmIknqZohIkroZIpKkboaIJKmbISJJ6maISJK6GSKSpG7/D04JLgF8F2rcAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oD-B7DGVQurh"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ti-JEHayYfrw"
},
"source": [
"# Creating the Vocabulary"
]
},
{
"cell_type": "code",
"metadata": {
"id": "SUXL7sKHI4Ln"
},
"source": [
"# Maps each GloVe token to its 50-dimensional vector.\n",
"embeddings = {}\n",
"# Read the file as UTF-8 explicitly so that parsing does not depend on the\n",
"# platform's default encoding.\n",
"with open(\"/content/embeddings/glove.6B/glove.6B.50d.txt\", 'r', encoding=\"utf-8\") as f:\n",
"    for line in f:\n",
"        values = line.split()\n",
"        # The last 50 fields are the vector; whatever precedes them is the token.\n",
"        word = \" \".join(values[:-50])\n",
"        vector = np.asarray(values[-50:], \"float64\")\n",
"        embeddings[word] = vector"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Pq6Xgx_XQwGl"
},
"source": [
"vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)\n",
"vectorizer.adapt(x_train)\n",
"\n",
"# Vocabulary learned from the training text, and each token's position in it.\n",
"voc = vectorizer.get_vocabulary()\n",
"word_index = {token: position for position, token in enumerate(voc)}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qhc31xjmRZgA",
"outputId": "fb837432-8ae4-4f5c-f589-ec278f7cbb0a"
},
"source": [
"num_tokens = len(voc) + 2\n",
"embedding_dim = 50\n",
"hits = 0\n",
"misses = 0\n",
"\n",
"# Row i holds the GloVe vector of vocabulary token i; tokens that GloVe does\n",
"# not know keep an all-zero row.\n",
"embedding_matrix = np.zeros((num_tokens, embedding_dim))\n",
"for word, i in word_index.items():\n",
"    embedding_vector = embeddings.get(word)\n",
"    if embedding_vector is not None:\n",
"        embedding_matrix[i] = embedding_vector\n",
"        hits += 1\n",
"    else:\n",
"        misses += 1\n",
"print(\"Converted %d words (%d misses)\" % (hits, misses))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Converted 18146 words (1854 misses)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ufVRRppeYjgm"
},
"source": [
"Saving the vectorizer configuration and weights, together with the embedding matrix, so that they can be reloaded during the testing phase"
]
},
{
"cell_type": "code",
"metadata": {
"id": "pkoALiNWxjL7"
},
"source": [
"DUMP = \"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/embeddings/\"\n",
"# Create the checkpoint directory on first use instead of failing on open().\n",
"os.makedirs(DUMP, exist_ok=True)\n",
"\n",
"# `with` closes each file as soon as it has been written, so the pickles are\n",
"# complete on disk before anything tries to read them back.\n",
"with open(os.path.join(DUMP, \"vectorizer.pkl\"), \"wb\") as f:\n",
"    pickle.dump({'config': vectorizer.get_config(),\n",
"                 'weights': vectorizer.get_weights()}, f)\n",
"\n",
"with open(os.path.join(DUMP, \"embedding.pkl\"), \"wb\") as f:\n",
"    pickle.dump(embedding_matrix, f)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "swCcDln-IFR2"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "b5RvMu4KYqEM"
},
"source": [
"# Model Architecture"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oSGbee7YFCEK"
},
"source": [
"# Frozen GloVe embedding -> LSTM -> single sigmoid unit, with dropout in between.\n",
"model = Sequential([\n",
"    Embedding(num_tokens, embedding_dim,\n",
"              embeddings_initializer=keras.initializers.Constant(embedding_matrix),\n",
"              trainable=False),\n",
"    Dropout(0.5),\n",
"    LSTM(384),\n",
"    Dropout(0.5),\n",
"    Dense(1, activation=\"sigmoid\"),\n",
"])\n",
"\n",
"model.compile(loss=\"binary_crossentropy\",optimizer=\"adam\",metrics=[\"accuracy\"])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pWZg8suoTSzw",
"outputId": "a17a67df-981c-4100-f371-2353a6859e68"
},
"source": [
"model.summary()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding (Embedding) (None, None, 50) 1000100 \n",
"_________________________________________________________________\n",
"dropout (Dropout) (None, None, 50) 0 \n",
"_________________________________________________________________\n",
"lstm (LSTM) (None, 384) 668160 \n",
"_________________________________________________________________\n",
"dropout_1 (Dropout) (None, 384) 0 \n",
"_________________________________________________________________\n",
"dense (Dense) (None, 1) 385 \n",
"=================================================================\n",
"Total params: 1,668,645\n",
"Trainable params: 668,545\n",
"Non-trainable params: 1,000,100\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XXo2-NkZYsoi"
},
"source": [
"# Training Phase"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_hXkfU84Yvos"
},
"source": [
"Encoding the raw text as sequences of integer token ids with the vectorizer"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eogE0KHIToJ2"
},
"source": [
"def encode(texts):\n",
"    \"\"\"Run raw strings through the vectorizer and return the token ids as a NumPy array.\"\"\"\n",
"    column = np.array([[sample] for sample in texts])\n",
"    return vectorizer(column).numpy()\n",
"\n",
"x_train = encode(x_train)\n",
"x_test = encode(x_test)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "IRcdBBFYY4KU"
},
"source": [
"## Training the model"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "eE2565IdPNmx",
"outputId": "2617a464-6f09-4769-b5e3-519f1bf922ae"
},
"source": [
"history = model.fit(x_train, y_train, epochs = 16, batch_size = 64)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/16\n",
"286/286 [==============================] - 12s 27ms/step - loss: 0.6391 - accuracy: 0.6444\n",
"Epoch 2/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6290 - accuracy: 0.6494\n",
"Epoch 3/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6309 - accuracy: 0.6580\n",
"Epoch 4/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6586 - accuracy: 0.6187\n",
"Epoch 5/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6598 - accuracy: 0.6148\n",
"Epoch 6/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6515 - accuracy: 0.6322\n",
"Epoch 7/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6454 - accuracy: 0.6339\n",
"Epoch 8/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6310 - accuracy: 0.6472\n",
"Epoch 9/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.6087 - accuracy: 0.6734\n",
"Epoch 10/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.5630 - accuracy: 0.7189\n",
"Epoch 11/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.4648 - accuracy: 0.7917\n",
"Epoch 12/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.3497 - accuracy: 0.8490\n",
"Epoch 13/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.2789 - accuracy: 0.8850\n",
"Epoch 14/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.2597 - accuracy: 0.8883\n",
"Epoch 15/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.2222 - accuracy: 0.9117\n",
"Epoch 16/16\n",
"286/286 [==============================] - 8s 27ms/step - loss: 0.1941 - accuracy: 0.9228\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4atVNRj-Y8dC"
},
"source": [
"Saving the model weights to load during the test phase"
]
},
{
"cell_type": "code",
"metadata": {
"id": "6P6YFQcKTdsL"
},
"source": [
"model.save_weights(\"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/embeddings/\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZT2q50YLZ8Sl"
},
"source": [
"## Visualizing the Loss and Accuracy"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 295
},
"id": "pdpZF57aaC29",
"outputId": "9669b7e4-474b-45e1-ed00-3f139bd0067c"
},
"source": [
"# Training accuracy per epoch, as recorded by model.fit.\n",
"fig, ax = plt.subplots()\n",
"ax.plot(history.history['accuracy'])\n",
"ax.set_title('Model Accuracy')\n",
"ax.set_ylabel('Accuracy')\n",
"ax.set_xlabel('Epochs')\n",
"ax.legend(['train'], loc='best')\n",
"plt.show()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 295
},
"id": "Iv1iGfh_aLXg",
"outputId": "d7105b4f-2806-48c4-bd40-e71d9ab68b0b"
},
"source": [
"# Training loss per epoch, as recorded by model.fit.\n",
"fig, ax = plt.subplots()\n",
"ax.plot(history.history['loss'])\n",
"ax.set_title('Model loss')\n",
"ax.set_ylabel('loss')\n",
"ax.set_xlabel('Epochs')\n",
"ax.legend(['train'], loc='best')\n",
"plt.show()"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9vxjwBRwu9Ck"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8tTTeUYNZCFF"
},
"source": [
"# Testing Phase"
]
},
{
"cell_type": "code",
"metadata": {
"id": "7YqesBrLZYSI"
},
"source": [
"DUMP = \"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/embeddings/\""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Pz_Y4mfgvP4t"
},
"source": [
"def isFake(text, DUMP):\n",
"    \"\"\"Score a piece of news text with the saved vectorizer and LSTM model.\n",
"\n",
"    Rebuilds the vectorizer and the classifier from the artifacts stored in\n",
"    `DUMP`, then runs a prediction on `text`.\n",
"\n",
"    Args:\n",
"        text: the news text to score (a single string).\n",
"        DUMP: directory holding `vectorizer.pkl`, `embedding.pkl` and the\n",
"            saved model weights.\n",
"\n",
"    Returns:\n",
"        A message of the form \"The news is <p> % Fake\", where <p> is the\n",
"        model output as a percentage truncated to two decimals.\n",
"    \"\"\"\n",
"    # Imported here because the notebook only imports names *from*\n",
"    # tensorflow.keras, which never binds the bare `tensorflow` module name.\n",
"    import tensorflow as tf\n",
"\n",
"    # pickle can execute arbitrary code while loading: only point DUMP at\n",
"    # files written by this notebook.\n",
"    with open(os.path.join(DUMP, \"vectorizer.pkl\"), \"rb\") as f:\n",
"        vectorizer_disk = pickle.load(f)\n",
"    with open(os.path.join(DUMP, \"embedding.pkl\"), \"rb\") as f:\n",
"        embedding_matrix = pickle.load(f)\n",
"\n",
"    vectorizer = TextVectorization.from_config(vectorizer_disk['config'])\n",
"    # Adapting on a dummy dataset before set_weights presumably initialises the\n",
"    # layer's vocabulary state so the stored weights can be assigned - TODO confirm.\n",
"    vectorizer.adapt(tf.data.Dataset.from_tensor_slices([\"xyz\"]))\n",
"    vectorizer.set_weights(vectorizer_disk['weights'])\n",
"\n",
"    embedding_dim = 50\n",
"\n",
"    # Must mirror the architecture used for training so the weights line up.\n",
"    model = Sequential()\n",
"    model.add(Embedding(len(embedding_matrix), embedding_dim, keras.initializers.Constant(embedding_matrix), trainable=False))\n",
"    model.add(Dropout(0.5))\n",
"    model.add(LSTM(384))\n",
"    model.add(Dropout(0.5))\n",
"    model.add(Dense(1, activation=\"sigmoid\"))\n",
"    model.compile(loss=\"binary_crossentropy\",optimizer=\"adam\",metrics=[\"accuracy\"])\n",
"    model.load_weights(DUMP)\n",
"\n",
"    text = np.array(text)\n",
"    text = vectorizer([text]).numpy()\n",
"    y_preds = model.predict(text)\n",
"\n",
"    return \"The news is {0} % Fake\".format(int(y_preds.item()*10000)/100)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1hcVTWgIZTRk"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment