sudip-mondal-2002/FakeNewsDetectionUsingLSTM_pytorch.ipynb

## FakeNewsDetectionUsingLSTM_pytorch.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "FakeNewsDetectionUsingLSTM-pytorch.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "49K4Q0GuMbYG",
        "outputId": "b3f661d4-83d0-4397-adef-dd5973f13326"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mounted at /content/drive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bMJchN9xckiW"
      },
      "source": [
        "# Downloading the dependencies\n",
        "\n",
        "- Downloading the dataset from kaggle using the kaggle API\n",
        "- Downloading pretrained GloVe embeddings"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lYIkzQE_JGi0"
      },
      "source": [
        "from IPython.display import clear_output\n",
        "\n",
        "!pip install kaggle\n",
        "%env KAGGLE_USERNAME=xerefic\n",
        "%env KAGGLE_KEY=83aac7088c3bb8150fcf8197ab22c67b\n",
        "\n",
        "!kaggle competitions download -c fake-news\n",
        "!unzip /content/train.csv.zip\n",
        "!unzip /content/test.csv.zip\n",
        "!rm *.zip\n",
        "\n",
        "clear_output()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GG2qBprzMr5P"
      },
      "source": [
        "!wget https://nlp.stanford.edu/data/glove.840B.300d.zip\n",
        "!mkdir embeddings \n",
        "!mkdir embeddings/glove.840B.300d\n",
        "!unzip /content/glove.840B.300d.zip -d \"/content/embeddings/glove.840B.300d\"\n",
        "\n",
        "clear_output()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NaVaT-qXKmbj"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "W70DcfG2cxwt"
      },
      "source": [
        "# Processing the Dataset\n",
        "\n",
        "Concatenating the title and text to increase the learning scope of our model."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FRUnqWCLKkVo"
      },
      "source": [
        "import pandas as pd\n",
        "import os"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 221
        },
        "id": "uhaxPOWVKnLj",
        "outputId": "bc79ec81-8c45-470c-ae8b-99f19a3d9b35"
      },
      "source": [
        "data = pd.read_csv(\"/content/train.csv\")\n",
        "data = data.drop(columns=[\"id\", \"title\", \"author\"])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "20800\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>text</th>\n",
              "      <th>label</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Ever get the feeling your life circles the rou...</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Why the Truth Might Get You Fired October 29, ...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Videos 15 Civilians Killed In Single US Airstr...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Print \\nAn Iranian woman has been sentenced to...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                text  label\n",
              "0  House Dem Aide: We Didn’t Even See Comey’s Let...      1\n",
              "1  Ever get the feeling your life circles the rou...      0\n",
              "2  Why the Truth Might Get You Fired October 29, ...      1\n",
              "3  Videos 15 Civilians Killed In Single US Airstr...      1\n",
              "4  Print \\nAn Iranian woman has been sentenced to...      1"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mYD_-B2ELjrB",
        "outputId": "e2f208ef-7e3d-437b-d251-c2b813606873"
      },
      "source": [
        "o_class = data.loc[data.label == 0, :]\n",
        "l_class = data.loc[data.label == 1, :]\n",
        "print(len(o_class))\n",
        "print(len(l_class))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "10387\n",
            "10413\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0Ka9chiqL-OI"
      },
      "source": [
        "valid_o = o_class.iloc[:1000, :]\n",
        "valid_l = l_class.iloc[:1000, :]\n",
        "\n",
        "train_o = o_class.iloc[1000:, :]\n",
        "train_l = l_class.iloc[1000:, :]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "m42GmaOmMJrA",
        "outputId": "87a7621e-81ce-4883-9b26-a796781ffc7d"
      },
      "source": [
        "train = pd.concat([train_o, train_l], axis=0)\n",
        "print(train.shape)\n",
        "\n",
        "valid = pd.concat([valid_o, valid_l], axis=0)\n",
        "print(valid.shape)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(18800, 2)\n",
            "(2000, 2)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m0LMbhozMNav"
      },
      "source": [
        "!mkdir inputs\n",
        "\n",
        "train.to_csv(\"/content/inputs/train.csv\", index=False)\n",
        "valid.to_csv(\"/content/inputs/valid.csv\", index=False)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FOSzTY8HdzEh"
      },
      "source": [
        "## Visualizing the Dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Mba0rNYfdyjj"
      },
      "source": [
        "sns.countplot(x='label', data=train)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0ML4zRmpd2f5"
      },
      "source": [
        "Cleaning up"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rN5O_Ok7Mm3u"
      },
      "source": [
        "del data, train, valid, train_l, train_o, valid_l, valid_o, o_class, l_class"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Kq8wnafrKlrF"
      },
      "source": [
        "---"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pmon-FPTcqaK"
      },
      "source": [
        "# Importing Libraries"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "n3xA-UzIdGIE"
      },
      "source": [
        "!pip install pyprind\n",
        "import pyprind"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2gqqNqi7dCNI"
      },
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "import random\n",
        "import os\n",
        "import sys\n",
        "import gc\n",
        "\n",
        "\n",
        "import matplotlib.pyplot as plt \n",
        "import seaborn as sns\n",
        "%matplotlib inline"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oza3RRKDi3Bs",
        "outputId": "78a12913-999a-40cb-8f57-7670b4a74448"
      },
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim\n",
        "from torch.nn.utils.rnn import pad_sequence\n",
        "import torchtext\n",
        "import spacy\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting pyprind\n",
            "  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)\n",
            "Installing collected packages: pyprind\n",
            "Successfully installed pyprind-2.11.3\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CpsoiDMpcuJu"
      },
      "source": [
        "PATH = '/content/'\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_DtJxQtCcuq1"
      },
      "source": [
        "# Dataloader"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kMxhB0LQltSK"
      },
      "source": [
        "class CreateDataset(torch.utils.data.Dataset):\n",
        "\n",
        "    def __init__(self, PATH, batch_size=32, mode='train'):\n",
        "        self.PATH = PATH\n",
        "        self.mode = mode + \".csv\"\n",
        "        self.batch_size = batch_size\n",
        "        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "        self.spacy = spacy.load(\"en_core_web_sm\")\n",
        "\n",
        "        self.TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=\"spacy\")\n",
        "        self.LABEL = torchtext.legacy.data.LabelField(dtype=torch.long, sequential=False)\n",
        "\n",
        "        self.initData()\n",
        "        self.initEmbed()\n",
        "\n",
        "        self.makeData()\n",
        "\n",
        "    def initData(self):\n",
        "        DATA = os.path.join(self.PATH, 'inputs/')\n",
        "\n",
        "        self.data = torchtext.legacy.data.TabularDataset(\n",
        "                        path=os.path.join(DATA, self.mode), \n",
        "                        format=\"csv\", \n",
        "                        skip_header=True, \n",
        "                        fields=[('Text', self.TEXT), ('Label', self.LABEL)])\n",
        "\n",
        "    def initEmbed(self):\n",
        "        EMBED = os.path.join(self.PATH, \"embeddings/glove.840B.300d/glove.840B.300d.txt\")\n",
        "\n",
        "        self.TEXT.build_vocab(self.data,\n",
        "                         vectors=torchtext.vocab.Vectors(EMBED), \n",
        "                         max_size=25000,\n",
        "                         min_freq=10)\n",
        "        self.LABEL.build_vocab(self.data)\n",
        "\n",
        "    def makeData(self):\n",
        "        self.iterator = torchtext.legacy.data.Iterator(\n",
        "                        self.data, \n",
        "                        sort_key=lambda x: len(x.Text), \n",
        "                        batch_size=self.batch_size,\n",
        "                        device=self.device)\n",
        "\n",
        "    def lengthData(self):\n",
        "        return len(self.data)\n",
        "    \n",
        "    def lengthVocab(self):\n",
        "        return len(self.TEXT.vocab), len(self.LABEL.vocab)\n",
        "\n",
        "    def freqLABEL(self):\n",
        "        return self.LABEL.vocab.freqs\n",
        "\n",
        "    def getData(self):\n",
        "        return self.iterator\n",
        "\n",
        "    def getEmbeddings(self):\n",
        "        return self.TEXT.vocab.vectors"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QCx2k_MKk_J8",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "de075519-3761-425a-e2eb-8c9a07367862"
      },
      "source": [
        "train_data = CreateDataset(\"/content/\", batch_size=16, mode='train')\n",
        "valid_data = CreateDataset(\"/content/\", batch_size=16, mode='valid')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100%|█████████▉| 2195783/2196017 [03:50<00:00, 10347.64it/s]"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aC9axW4MlnDo"
      },
      "source": [
        "trainloader = train_data.getData()\n",
        "valloader = valid_data.getData()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iRJQ_VUVqGtX"
      },
      "source": [
        "# Model Architecture"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ig5lFvb5qBVg"
      },
      "source": [
        "class LSTM(torch.nn.Module):\n",
        "    def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, static=False, dropout=0.2):\n",
        "        super(LSTM, self).__init__()\n",
        "        self.hidden_dim = hidden_dim\n",
        "\n",
        "        self.dropout = torch.nn.Dropout(p=dropout)\n",
        "\n",
        "        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)\n",
        "        if static:\n",
        "            self.embedding.weight.requires_grad = False\n",
        "\n",
        "        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, \n",
        "                                         num_layers=num_layers,\n",
        "                                         bidirectional=True, \n",
        "                                         dropout=dropout, \n",
        "                                         batch_first=True)\n",
        "        self.linear = torch.nn.Linear(hidden_dim*num_layers*2, 1)\n",
        "    \n",
        "    def forward(self, text):\n",
        "        embedded = self.embedding(text)\n",
        "        embedded = torch.transpose(embedded, dim0=1, dim1=0)\n",
        "        lstm_out, (hidden, cell) = self.lstm(embedded)\n",
        "        out = self.linear(self.dropout(torch.cat([cell[i,:, :] for i in range(cell.shape[0])], dim=1)))\n",
        "        return out"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UWOCuOTEdemZ"
      },
      "source": [
        "## Initializing the Model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vgxiNTQZqvph"
      },
      "source": [
        "pretrained_embeddings = train_data.getEmbeddings()\n",
        "input_dim = train_data.lengthVocab()[0]\n",
        "embedding_dim = 300\n",
        "hidden_dim = 384\n",
        "output_dim = 2\n",
        "num_layers = 2\n",
        "batch_size = 16"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wQEKYLF8H46V"
      },
      "source": [
        "model = LSTM(input_dim, embedding_dim, hidden_dim, num_layers)\n",
        "model.embedding.weight.data = pretrained_embeddings.to(device)\n",
        "class_weights = torch.tensor([1.0, 15.0]).to(device)\n",
        "model = model.to(device)\n",
        "pass"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JCrmOyb2H90Y"
      },
      "source": [
        "optimizer = optim.SGD(model.parameters(), lr=1e-4)\n",
        "criterion = nn.BCEWithLogitsLoss().to(device)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gBkjuh3eM5E-"
      },
      "source": [
        "start_epochs = 0\n",
        "total_epochs = 16"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dXeT0tezMe2v"
      },
      "source": [
        "CHECKPOINT = \"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/LSTM\"\n",
        "\n",
        "if os.path.exists(os.path.join(CHECKPOINT, \"model.pth\")):\n",
        "    checkpoints = torch.load(os.path.join(CHECKPOINT, \"model.pth\"))\n",
        "\n",
        "    model.load_state_dict(checkpoints['model_state_dict'])\n",
        "    optimizer.load_state_dict(checkpoints['optimizer_state_dict'])\n",
        "    start_epochs = checkpoints['epoch']"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "A5_rTjpwdanr"
      },
      "source": [
        "## Utility Functions"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5oIMgH-lYepm"
      },
      "source": [
        "def binary_accuracy(preds, y):\n",
        "\n",
        "    preds = torch.sigmoid(preds)\n",
        "    preds = torch.round(preds)\n",
        "\n",
        "    correct = (preds == y).float()\n",
        "    acc = correct.sum()/float(len(correct))\n",
        "    return acc"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NBu2_UHfrnQx"
      },
      "source": [
        "epoch_train_losses = []\n",
        "accu_train_epoch = []\n",
        "epoch_val_losses = []\n",
        "accu_val_epoch = []"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uE7z6AGeHiMt"
      },
      "source": [
        "def train(model, iterator, optimizer, criterion):\n",
        "    \n",
        "    train_loss_batch = []\n",
        "    accu_train_batch = []\n",
        "    model.train()\n",
        "\n",
        "    gc.collect()\n",
        "    torch.cuda.empty_cache()\n",
        "\n",
        "    bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
        "    for idx, batch in enumerate(iterator, 1):\n",
        "        optimizer.zero_grad()\n",
        "                \n",
        "        predictions = model.forward(batch.Text).view(-1)\n",
        "        batch.Label = (batch.Label).type_as(predictions)\n",
        "        train_loss = criterion(predictions, batch.Label)\n",
        "        acc = binary_accuracy(predictions, batch.Label)\n",
        "        \n",
        "        train_loss.backward()\n",
        "        optimizer.step()\n",
        "        \n",
        "        train_loss_batch.append(train_loss.item())\n",
        "        accu_train_batch.append(acc)\n",
        "        bar.update()\n",
        "        gc.collect()\n",
        "        torch.cuda.empty_cache()\n",
        "\n",
        "    epoch_train_losses.append(sum(train_loss_batch)/len(iterator))\n",
        "    accu_train_epoch.append(sum(accu_train_batch)/len(iterator))\n",
        "\n",
        "    return epoch_train_losses[-1], accu_train_epoch[-1]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "S8PzsM3dHj3Y"
      },
      "source": [
        "def evaluate(model, iterator, criterion):\n",
        "    \n",
        "    val_loss_batch = []\n",
        "    accu_val_batch = []\n",
        "    model.eval()\n",
        "\n",
        "    gc.collect()\n",
        "    torch.cuda.empty_cache()\n",
        "    \n",
        "    with torch.no_grad():\n",
        "        bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
        "        for idx, batch in enumerate(iterator, 1):\n",
        "\n",
        "            predictions = model.forward(batch.Text).view(-1)\n",
        "            batch.Label = (batch.Label).type_as(predictions)\n",
        "            val_loss = criterion(predictions, batch.Label)\n",
        "            \n",
        "            acc = binary_accuracy(predictions, batch.Label)\n",
        "\n",
        "            val_loss_batch.append(val_loss.item())\n",
        "            accu_val_batch.append(acc)\n",
        "            bar.update()\n",
        "            gc.collect()\n",
        "            torch.cuda.empty_cache()\n",
        "            \n",
        "        epoch_val_losses.append(sum(val_loss_batch)/len(iterator))\n",
        "        accu_val_epoch.append(sum(accu_val_batch)/len(iterator))\n",
        "    return epoch_val_losses[-1], accu_val_epoch[-1]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "D6-ucY1adkIM"
      },
      "source": [
        "# Training Phase"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kgxdrB8nHp72"
      },
      "source": [
        "for epoch in range(start_epochs+1, total_epochs+start_epochs+1):\n",
        "\n",
        "    train_loss, train_acc = train(model, trainloader, optimizer, criterion)\n",
        "    valid_loss, valid_acc = evaluate(model, valloader, criterion)\n",
        "\n",
        "    torch.save({\n",
        "            'epoch': epoch,\n",
        "            'model_state_dict': model.state_dict(),\n",
        "            'optimizer_state_dict': optimizer.state_dict(),\n",
        "            'loss': epoch_train_losses[-1],\n",
        "            }, os.path.join(CHECKPOINT, \"model.pth\"))\n",
        "    \n",
        "    print(f'| Epoch: [{epoch:02}/{total_epochs+start_epochs+1}] | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}

## FakeNewsDetectionUsingLSTM_tensorflow.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              FakeNewsDetectionUsingLSTM_tensorflow.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "FakeNewsDetectionUsingLSTM-pytorch.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"machine_shape": "hm"
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "49K4Q0GuMbYG",
	"outputId": "b3f661d4-83d0-4397-adef-dd5973f13326"
	},
	"source": [
	"from google.colab import drive\n",
	"drive.mount('/content/drive')"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Mounted at /content/drive\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "bMJchN9xckiW"
	},
	"source": [
	"# Downloading the dependencies\n",
	"\n",
	"- Downloading the dataset from kaggle using the kaggle API\n",
	"- Downloading pretrained GloVe embeddings"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "lYIkzQE_JGi0"
	},
	"source": [
	"from IPython.display import clear_output\n",
	"\n",
	"!pip install kaggle\n",
	"%env KAGGLE_USERNAME=xerefic\n",
	"%env KAGGLE_KEY=83aac7088c3bb8150fcf8197ab22c67b\n",
	"\n",
	"!kaggle competitions download -c fake-news\n",
	"!unzip /content/train.csv.zip\n",
	"!unzip /content/test.csv.zip\n",
	"!rm *.zip\n",
	"\n",
	"clear_output()"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "GG2qBprzMr5P"
	},
	"source": [
	"!wget https://nlp.stanford.edu/data/glove.840B.300d.zip\n",
	"!mkdir embeddings \n",
	"!mkdir embeddings/glove.840B.300d\n",
	"!unzip /content/glove.840B.300d.zip -d \"/content/embeddings/glove.840B.300d\"\n",
	"\n",
	"clear_output()"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "NaVaT-qXKmbj"
	},
	"source": [
	"---"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "W70DcfG2cxwt"
	},
	"source": [
	"# Processing the Dataset\n",
	"\n",
	"Concatenating the title and text to increase the learning scope of our model."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "FRUnqWCLKkVo"
	},
	"source": [
	"import pandas as pd\n",
	"import os"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 221
	},
	"id": "uhaxPOWVKnLj",
	"outputId": "bc79ec81-8c45-470c-ae8b-99f19a3d9b35"
	},
	"source": [
	"data = pd.read_csv(\"/content/train.csv\")\n",
	"data = data.drop(columns=[\"id\", \"title\", \"author\"])"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"20800\n"
	],
	"name": "stdout"
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>text</th>\n",
	" <th>label</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Ever get the feeling your life circles the rou...</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>Why the Truth Might Get You Fired October 29, ...</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>Videos 15 Civilians Killed In Single US Airstr...</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>Print \\nAn Iranian woman has been sentenced to...</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" text label\n",
	"0 House Dem Aide: We Didn’t Even See Comey’s Let... 1\n",
	"1 Ever get the feeling your life circles the rou... 0\n",
	"2 Why the Truth Might Get You Fired October 29, ... 1\n",
	"3 Videos 15 Civilians Killed In Single US Airstr... 1\n",
	"4 Print \\nAn Iranian woman has been sentenced to... 1"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 7
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "mYD_-B2ELjrB",
	"outputId": "e2f208ef-7e3d-437b-d251-c2b813606873"
	},
	"source": [
	"o_class = data.loc[data.label == 0, :]\n",
	"l_class = data.loc[data.label == 1, :]\n",
	"print(len(o_class))\n",
	"print(len(l_class))"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"10387\n",
	"10413\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "0Ka9chiqL-OI"
	},
	"source": [
	"valid_o = o_class.iloc[:1000, :]\n",
	"valid_l = l_class.iloc[:1000, :]\n",
	"\n",
	"train_o = o_class.iloc[1000:, :]\n",
	"train_l = l_class.iloc[1000:, :]"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "m42GmaOmMJrA",
	"outputId": "87a7621e-81ce-4883-9b26-a796781ffc7d"
	},
	"source": [
	"train = pd.concat([train_o, train_l], axis=0)\n",
	"print(train.shape)\n",
	"\n",
	"valid = pd.concat([valid_o, valid_l], axis=0)\n",
	"print(valid.shape)"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"(18800, 2)\n",
	"(2000, 2)\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "m0LMbhozMNav"
	},
	"source": [
	"!mkdir inputs\n",
	"\n",
	"train.to_csv(\"/content/inputs/train.csv\", index=False)\n",
	"valid.to_csv(\"/content/inputs/valid.csv\", index=False)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "FOSzTY8HdzEh"
	},
	"source": [
	"## Visualizing the Dataset"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Mba0rNYfdyjj"
	},
	"source": [
	"sns.countplot(x='label', data=train)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "0ML4zRmpd2f5"
	},
	"source": [
	"Cleaning up"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "rN5O_Ok7Mm3u"
	},
	"source": [
	"del data, train, valid, train_l, train_o, valid_l, valid_o, o_class, l_class"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "Kq8wnafrKlrF"
	},
	"source": [
	"---"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "pmon-FPTcqaK"
	},
	"source": [
	"# Importing Libraries"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "n3xA-UzIdGIE"
	},
	"source": [
	"!pip install pyprind\n",
	"import pyprind"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "2gqqNqi7dCNI"
	},
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"import random\n",
	"import os\n",
	"import sys\n",
	"import gc\n",
	"\n",
	"\n",
	"import matplotlib.pyplot as plt \n",
	"import seaborn as sns\n",
	"%matplotlib inline"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "oza3RRKDi3Bs",
	"outputId": "78a12913-999a-40cb-8f57-7670b4a74448"
	},
	"source": [
	"import torch\n",
	"import torch.nn as nn\n",
	"import torch.nn.functional as F\n",
	"import torch.optim as optim\n",
	"from torch.nn.utils.rnn import pad_sequence\n",
	"import torchtext\n",
	"import spacy\n"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Collecting pyprind\n",
	" Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)\n",
	"Installing collected packages: pyprind\n",
	"Successfully installed pyprind-2.11.3\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "CpsoiDMpcuJu"
	},
	"source": [
	"PATH = '/content/'\n",
	"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "_DtJxQtCcuq1"
	},
	"source": [
	"# Dataloader"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "kMxhB0LQltSK"
	},
	"source": [
	"class CreateDataset(torch.utils.data.Dataset):\n",
	"\n",
	" def __init__(self, PATH, batch_size=32, mode='train'):\n",
	" self.PATH = PATH\n",
	" self.mode = mode + \".csv\"\n",
	" self.batch_size = batch_size\n",
	" self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
	"\n",
	" self.spacy = spacy.load(\"en_core_web_sm\")\n",
	"\n",
	" self.TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=\"spacy\")\n",
	" self.LABEL = torchtext.legacy.data.LabelField(dtype=torch.long, sequential=False)\n",
	"\n",
	" self.initData()\n",
	" self.initEmbed()\n",
	"\n",
	" self.makeData()\n",
	"\n",
	" def initData(self):\n",
	" DATA = os.path.join(self.PATH, 'inputs/')\n",
	"\n",
	" self.data = torchtext.legacy.data.TabularDataset(\n",
	" path=os.path.join(DATA, self.mode), \n",
	" format=\"csv\", \n",
	" skip_header=True, \n",
	" fields=[('Text', self.TEXT), ('Label', self.LABEL)])\n",
	"\n",
	" def initEmbed(self):\n",
	" EMBED = os.path.join(self.PATH, \"embeddings/glove.840B.300d/glove.840B.300d.txt\")\n",
	"\n",
	" self.TEXT.build_vocab(self.data,\n",
	" vectors=torchtext.vocab.Vectors(EMBED), \n",
	" max_size=25000,\n",
	" min_freq=10)\n",
	" self.LABEL.build_vocab(self.data)\n",
	"\n",
	" def makeData(self):\n",
	" self.iterator = torchtext.legacy.data.Iterator(\n",
	" self.data, \n",
	" sort_key=lambda x: len(x.Text), \n",
	" batch_size=self.batch_size,\n",
	" device=self.device)\n",
	"\n",
	" def lengthData(self):\n",
	" return len(self.data)\n",
	" \n",
	" def lengthVocab(self):\n",
	" return len(self.TEXT.vocab), len(self.LABEL.vocab)\n",
	"\n",
	" def freqLABEL(self):\n",
	" return self.LABEL.vocab.freqs\n",
	"\n",
	" def getData(self):\n",
	" return self.iterator\n",
	"\n",
	" def getEmbeddings(self):\n",
	" return self.TEXT.vocab.vectors"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "QCx2k_MKk_J8",
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"outputId": "de075519-3761-425a-e2eb-8c9a07367862"
	},
	"source": [
	"train_data = CreateDataset(\"/content/\", batch_size=16, mode='train')\n",
	"valid_data = CreateDataset(\"/content/\", batch_size=16, mode='valid')"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"100%\|█████████▉\| 2195783/2196017 [03:50<00:00, 10347.64it/s]"
	],
	"name": "stderr"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "aC9axW4MlnDo"
	},
	"source": [
	"trainloader = train_data.getData()\n",
	"valloader = valid_data.getData()"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "iRJQ_VUVqGtX"
	},
	"source": [
	"# Model Architecture"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ig5lFvb5qBVg"
	},
	"source": [
	"class LSTM(torch.nn.Module):\n",
	" def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, static=False, dropout=0.2):\n",
	" super(LSTM, self).__init__()\n",
	" self.hidden_dim = hidden_dim\n",
	"\n",
	" self.dropout = torch.nn.Dropout(p=dropout)\n",
	"\n",
	" self.embedding = torch.nn.Embedding(input_dim, embedding_dim)\n",
	" if static:\n",
	" self.embedding.weight.requires_grad = False\n",
	"\n",
	" self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, \n",
	" num_layers=num_layers,\n",
	" bidirectional=True, \n",
	" dropout=dropout, \n",
	" batch_first=True)\n",
	" self.linear = torch.nn.Linear(hidden_dimnum_layers2, 1)\n",
	" \n",
	" def forward(self, text):\n",
	" embedded = self.embedding(text)\n",
	" embedded = torch.transpose(embedded, dim0=1, dim1=0)\n",
	" lstm_out, (hidden, cell) = self.lstm(embedded)\n",
	" out = self.linear(self.dropout(torch.cat([cell[i,:, :] for i in range(cell.shape[0])], dim=1)))\n",
	" return out"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "UWOCuOTEdemZ"
	},
	"source": [
	"## Initializing the Model"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "vgxiNTQZqvph"
	},
	"source": [
	"pretrained_embeddings = train_data.getEmbeddings()\n",
	"input_dim = train_data.lengthVocab()[0]\n",
	"embedding_dim = 300\n",
	"hidden_dim = 384\n",
	"output_dim = 2\n",
	"num_layers = 2\n",
	"batch_size = 16"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "wQEKYLF8H46V"
	},
	"source": [
	"model = LSTM(input_dim, embedding_dim, hidden_dim, num_layers)\n",
	"model.embedding.weight.data = pretrained_embeddings.to(device)\n",
	"class_weights = torch.tensor([1.0, 15.0]).to(device)\n",
	"model = model.to(device)\n",
	"pass"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "JCrmOyb2H90Y"
	},
	"source": [
	"optimizer = optim.SGD(model.parameters(), lr=1e-4)\n",
	"criterion = nn.BCEWithLogitsLoss().to(device)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "gBkjuh3eM5E-"
	},
	"source": [
	"start_epochs = 0\n",
	"total_epochs = 16"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "dXeT0tezMe2v"
	},
	"source": [
	"CHECKPOINT = \"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/LSTM\"\n",
	"\n",
	"if os.path.exists(os.path.join(CHECKPOINT, \"model.pth\")):\n",
	" checkpoints = torch.load(os.path.join(CHECKPOINT, \"model.pth\"))\n",
	"\n",
	" model.load_state_dict(checkpoints['model_state_dict'])\n",
	" optimizer.load_state_dict(checkpoints['optimizer_state_dict'])\n",
	" start_epochs = checkpoints['epoch']"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "A5_rTjpwdanr"
	},
	"source": [
	"## Utility Functions"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "5oIMgH-lYepm"
	},
	"source": [
	"def binary_accuracy(preds, y):\n",
	"\n",
	" preds = torch.sigmoid(preds)\n",
	" preds = torch.round(preds)\n",
	"\n",
	" correct = (preds == y).float()\n",
	" acc = correct.sum()/float(len(correct))\n",
	" return acc"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "NBu2_UHfrnQx"
	},
	"source": [
	"epoch_train_losses = []\n",
	"accu_train_epoch = []\n",
	"epoch_val_losses = []\n",
	"accu_val_epoch = []"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "uE7z6AGeHiMt"
	},
	"source": [
	"def train(model, iterator, optimizer, criterion):\n",
	" \n",
	" train_loss_batch = []\n",
	" accu_train_batch = []\n",
	" model.train()\n",
	"\n",
	" gc.collect()\n",
	" torch.cuda.empty_cache()\n",
	"\n",
	" bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
	" for idx, batch in enumerate(iterator, 1):\n",
	" optimizer.zero_grad()\n",
	" \n",
	" predictions = model.forward(batch.Text).view(-1)\n",
	" batch.Label = (batch.Label).type_as(predictions)\n",
	" train_loss = criterion(predictions, batch.Label)\n",
	" acc = binary_accuracy(predictions, batch.Label)\n",
	" \n",
	" train_loss.backward()\n",
	" optimizer.step()\n",
	" \n",
	" train_loss_batch.append(train_loss.item())\n",
	" accu_train_batch.append(acc)\n",
	" bar.update()\n",
	" gc.collect()\n",
	" torch.cuda.empty_cache()\n",
	"\n",
	" epoch_train_losses.append(sum(train_loss_batch)/len(iterator))\n",
	" accu_train_epoch.append(sum(accu_train_batch)/len(iterator))\n",
	"\n",
	" return epoch_train_losses[-1], accu_train_epoch[-1]"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "S8PzsM3dHj3Y"
	},
	"source": [
	"def evaluate(model, iterator, criterion):\n",
	" \n",
	" val_loss_batch = []\n",
	" accu_val_batch = []\n",
	" model.eval()\n",
	"\n",
	" gc.collect()\n",
	" torch.cuda.empty_cache()\n",
	" \n",
	" with torch.no_grad():\n",
	" bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
	" for idx, batch in enumerate(iterator, 1):\n",
	"\n",
	" predictions = model.forward(batch.Text).view(-1)\n",
	" batch.Label = (batch.Label).type_as(predictions)\n",
	" val_loss = criterion(predictions, batch.Label)\n",
	" \n",
	" acc = binary_accuracy(predictions, batch.Label)\n",
	"\n",
	" val_loss_batch.append(val_loss.item())\n",
	" accu_val_batch.append(acc)\n",
	" bar.update()\n",
	" gc.collect()\n",
	" torch.cuda.empty_cache()\n",
	" \n",
	" epoch_val_losses.append(sum(val_loss_batch)/len(iterator))\n",
	" accu_val_epoch.append(sum(accu_val_batch)/len(iterator))\n",
	" return epoch_val_losses[-1], accu_val_epoch[-1]"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "D6-ucY1adkIM"
	},
	"source": [
	"# Training Phase"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "kgxdrB8nHp72"
	},
	"source": [
	"for epoch in range(start_epochs+1, total_epochs+start_epochs+1):\n",
	"\n",
	" train_loss, train_acc = train(model, trainloader, optimizer, criterion)\n",
	" valid_loss, valid_acc = evaluate(model, valloader, criterion)\n",
	"\n",
	" torch.save({\n",
	" 'epoch': epoch,\n",
	" 'model_state_dict': model.state_dict(),\n",
	" 'optimizer_state_dict': optimizer.state_dict(),\n",
	" 'loss': epoch_train_losses[-1],\n",
	" }, os.path.join(CHECKPOINT, \"model.pth\"))\n",
	" \n",
	" print(f'\| Epoch: [{epoch:02}/{total_epochs+start_epochs+1}] \| Train Loss: {train_loss:.3f} \| Train Acc: {train_acc100:.2f}% \| Val. Loss: {valid_loss:.3f} \| Val. Acc: {valid_acc100:.2f}% \|')"
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}