Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save sudip-mondal-2002/67c735d0e1cbd768b698e47dd1404f58 to your computer and use it in GitHub Desktop.
Save sudip-mondal-2002/67c735d0e1cbd768b698e47dd1404f58 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "FakeNewsDetectionUsingLSTM-pytorch.ipynb",
"provenance": [],
"collapsed_sections": [],
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "49K4Q0GuMbYG",
"outputId": "b3f661d4-83d0-4397-adef-dd5973f13326"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bMJchN9xckiW"
},
"source": [
"# Downloading the dependencies\n",
"\n",
"- Downloading the dataset from kaggle using the kaggle API\n",
"- Downloading pretrained GloVe embeddings"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lYIkzQE_JGi0"
},
"source": [
"from IPython.display import clear_output\n",
"\n",
"!pip install kaggle\n",
"%env KAGGLE_USERNAME=xerefic\n",
"%env KAGGLE_KEY=83aac7088c3bb8150fcf8197ab22c67b\n",
"\n",
"!kaggle competitions download -c fake-news\n",
"!unzip /content/train.csv.zip\n",
"!unzip /content/test.csv.zip\n",
"!rm *.zip\n",
"\n",
"clear_output()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "GG2qBprzMr5P"
},
"source": [
"!wget https://nlp.stanford.edu/data/glove.840B.300d.zip\n",
"!mkdir embeddings \n",
"!mkdir embeddings/glove.840B.300d\n",
"!unzip /content/glove.840B.300d.zip -d \"/content/embeddings/glove.840B.300d\"\n",
"\n",
"clear_output()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "NaVaT-qXKmbj"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "W70DcfG2cxwt"
},
"source": [
"# Processing the Dataset\n",
"\n",
"Concatenating the title and text to increase the learning scope of our model."
]
},
{
"cell_type": "code",
"metadata": {
"id": "FRUnqWCLKkVo"
},
"source": [
"import pandas as pd\n",
"import os"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "uhaxPOWVKnLj",
"outputId": "bc79ec81-8c45-470c-ae8b-99f19a3d9b35"
},
"source": [
"data = pd.read_csv(\"/content/train.csv\")\n",
"data = data.drop(columns=[\"id\", \"title\", \"author\"])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"20800\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Ever get the feeling your life circles the rou...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Why the Truth Might Get You Fired October 29, ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Videos 15 Civilians Killed In Single US Airstr...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Print \\nAn Iranian woman has been sentenced to...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 House Dem Aide: We Didn’t Even See Comey’s Let... 1\n",
"1 Ever get the feeling your life circles the rou... 0\n",
"2 Why the Truth Might Get You Fired October 29, ... 1\n",
"3 Videos 15 Civilians Killed In Single US Airstr... 1\n",
"4 Print \\nAn Iranian woman has been sentenced to... 1"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mYD_-B2ELjrB",
"outputId": "e2f208ef-7e3d-437b-d251-c2b813606873"
},
"source": [
"o_class = data.loc[data.label == 0, :]\n",
"l_class = data.loc[data.label == 1, :]\n",
"print(len(o_class))\n",
"print(len(l_class))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"10387\n",
"10413\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0Ka9chiqL-OI"
},
"source": [
"valid_o = o_class.iloc[:1000, :]\n",
"valid_l = l_class.iloc[:1000, :]\n",
"\n",
"train_o = o_class.iloc[1000:, :]\n",
"train_l = l_class.iloc[1000:, :]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "m42GmaOmMJrA",
"outputId": "87a7621e-81ce-4883-9b26-a796781ffc7d"
},
"source": [
"train = pd.concat([train_o, train_l], axis=0)\n",
"print(train.shape)\n",
"\n",
"valid = pd.concat([valid_o, valid_l], axis=0)\n",
"print(valid.shape)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"(18800, 2)\n",
"(2000, 2)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "m0LMbhozMNav"
},
"source": [
"!mkdir inputs\n",
"\n",
"train.to_csv(\"/content/inputs/train.csv\", index=False)\n",
"valid.to_csv(\"/content/inputs/valid.csv\", index=False)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "FOSzTY8HdzEh"
},
"source": [
"## Visualizing the Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Mba0rNYfdyjj"
},
"source": [
"sns.countplot(x='label', data=train)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0ML4zRmpd2f5"
},
"source": [
"Cleaning up"
]
},
{
"cell_type": "code",
"metadata": {
"id": "rN5O_Ok7Mm3u"
},
"source": [
"del data, train, valid, train_l, train_o, valid_l, valid_o, o_class, l_class"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Kq8wnafrKlrF"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pmon-FPTcqaK"
},
"source": [
"# Importing Libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "n3xA-UzIdGIE"
},
"source": [
"!pip install pyprind\n",
"import pyprind"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2gqqNqi7dCNI"
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import random\n",
"import os\n",
"import sys\n",
"import gc\n",
"\n",
"\n",
"import matplotlib.pyplot as plt \n",
"import seaborn as sns\n",
"%matplotlib inline"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oza3RRKDi3Bs",
"outputId": "78a12913-999a-40cb-8f57-7670b4a74448"
},
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torch.nn.utils.rnn import pad_sequence\n",
"import torchtext\n",
"import spacy\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Collecting pyprind\n",
" Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)\n",
"Installing collected packages: pyprind\n",
"Successfully installed pyprind-2.11.3\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "CpsoiDMpcuJu"
},
"source": [
"PATH = '/content/'\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "_DtJxQtCcuq1"
},
"source": [
"# Dataloader"
]
},
{
"cell_type": "code",
"metadata": {
"id": "kMxhB0LQltSK"
},
"source": [
"class CreateDataset(torch.utils.data.Dataset):\n",
"\n",
" def __init__(self, PATH, batch_size=32, mode='train'):\n",
" self.PATH = PATH\n",
" self.mode = mode + \".csv\"\n",
" self.batch_size = batch_size\n",
" self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
" self.spacy = spacy.load(\"en_core_web_sm\")\n",
"\n",
" self.TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=\"spacy\")\n",
" self.LABEL = torchtext.legacy.data.LabelField(dtype=torch.long, sequential=False)\n",
"\n",
" self.initData()\n",
" self.initEmbed()\n",
"\n",
" self.makeData()\n",
"\n",
" def initData(self):\n",
" DATA = os.path.join(self.PATH, 'inputs/')\n",
"\n",
" self.data = torchtext.legacy.data.TabularDataset(\n",
" path=os.path.join(DATA, self.mode), \n",
" format=\"csv\", \n",
" skip_header=True, \n",
" fields=[('Text', self.TEXT), ('Label', self.LABEL)])\n",
"\n",
" def initEmbed(self):\n",
" EMBED = os.path.join(self.PATH, \"embeddings/glove.840B.300d/glove.840B.300d.txt\")\n",
"\n",
" self.TEXT.build_vocab(self.data,\n",
" vectors=torchtext.vocab.Vectors(EMBED), \n",
" max_size=25000,\n",
" min_freq=10)\n",
" self.LABEL.build_vocab(self.data)\n",
"\n",
" def makeData(self):\n",
" self.iterator = torchtext.legacy.data.Iterator(\n",
" self.data, \n",
" sort_key=lambda x: len(x.Text), \n",
" batch_size=self.batch_size,\n",
" device=self.device)\n",
"\n",
" def lengthData(self):\n",
" return len(self.data)\n",
" \n",
" def lengthVocab(self):\n",
" return len(self.TEXT.vocab), len(self.LABEL.vocab)\n",
"\n",
" def freqLABEL(self):\n",
" return self.LABEL.vocab.freqs\n",
"\n",
" def getData(self):\n",
" return self.iterator\n",
"\n",
" def getEmbeddings(self):\n",
" return self.TEXT.vocab.vectors"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "QCx2k_MKk_J8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "de075519-3761-425a-e2eb-8c9a07367862"
},
"source": [
"train_data = CreateDataset(\"/content/\", batch_size=16, mode='train')\n",
"valid_data = CreateDataset(\"/content/\", batch_size=16, mode='valid')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|█████████▉| 2195783/2196017 [03:50<00:00, 10347.64it/s]"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "aC9axW4MlnDo"
},
"source": [
"trainloader = train_data.getData()\n",
"valloader = valid_data.getData()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "iRJQ_VUVqGtX"
},
"source": [
"# Model Architecture"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ig5lFvb5qBVg"
},
"source": [
"class LSTM(torch.nn.Module):\n",
" def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, static=False, dropout=0.2):\n",
" super(LSTM, self).__init__()\n",
" self.hidden_dim = hidden_dim\n",
"\n",
" self.dropout = torch.nn.Dropout(p=dropout)\n",
"\n",
" self.embedding = torch.nn.Embedding(input_dim, embedding_dim)\n",
" if static:\n",
" self.embedding.weight.requires_grad = False\n",
"\n",
" self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, \n",
" num_layers=num_layers,\n",
" bidirectional=True, \n",
" dropout=dropout, \n",
" batch_first=True)\n",
" self.linear = torch.nn.Linear(hidden_dim*num_layers*2, 1)\n",
" \n",
" def forward(self, text):\n",
" embedded = self.embedding(text)\n",
" embedded = torch.transpose(embedded, dim0=1, dim1=0)\n",
" lstm_out, (hidden, cell) = self.lstm(embedded)\n",
" out = self.linear(self.dropout(torch.cat([cell[i,:, :] for i in range(cell.shape[0])], dim=1)))\n",
" return out"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "UWOCuOTEdemZ"
},
"source": [
"## Initializing the Model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vgxiNTQZqvph"
},
"source": [
"pretrained_embeddings = train_data.getEmbeddings()\n",
"input_dim = train_data.lengthVocab()[0]\n",
"embedding_dim = 300\n",
"hidden_dim = 384\n",
"output_dim = 2\n",
"num_layers = 2\n",
"batch_size = 16"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "wQEKYLF8H46V"
},
"source": [
"model = LSTM(input_dim, embedding_dim, hidden_dim, num_layers)\n",
"model.embedding.weight.data = pretrained_embeddings.to(device)\n",
"class_weights = torch.tensor([1.0, 15.0]).to(device)\n",
"model = model.to(device)\n",
"pass"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "JCrmOyb2H90Y"
},
"source": [
"optimizer = optim.SGD(model.parameters(), lr=1e-4)\n",
"criterion = nn.BCEWithLogitsLoss().to(device)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gBkjuh3eM5E-"
},
"source": [
"start_epochs = 0\n",
"total_epochs = 16"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dXeT0tezMe2v"
},
"source": [
"CHECKPOINT = \"/content/drive/MyDrive/Projects/Hackathons/FakeNews-Team_Hackers/checkpoints/LSTM\"\n",
"\n",
"if os.path.exists(os.path.join(CHECKPOINT, \"model.pth\")):\n",
" checkpoints = torch.load(os.path.join(CHECKPOINT, \"model.pth\"))\n",
"\n",
" model.load_state_dict(checkpoints['model_state_dict'])\n",
" optimizer.load_state_dict(checkpoints['optimizer_state_dict'])\n",
" start_epochs = checkpoints['epoch']"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "A5_rTjpwdanr"
},
"source": [
"## Utility Functions"
]
},
{
"cell_type": "code",
"metadata": {
"id": "5oIMgH-lYepm"
},
"source": [
"def binary_accuracy(preds, y):\n",
"\n",
" preds = torch.sigmoid(preds)\n",
" preds = torch.round(preds)\n",
"\n",
" correct = (preds == y).float()\n",
" acc = correct.sum()/float(len(correct))\n",
" return acc"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "NBu2_UHfrnQx"
},
"source": [
"epoch_train_losses = []\n",
"accu_train_epoch = []\n",
"epoch_val_losses = []\n",
"accu_val_epoch = []"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uE7z6AGeHiMt"
},
"source": [
"def train(model, iterator, optimizer, criterion):\n",
" \n",
" train_loss_batch = []\n",
" accu_train_batch = []\n",
" model.train()\n",
"\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
"\n",
" bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
" for idx, batch in enumerate(iterator, 1):\n",
" optimizer.zero_grad()\n",
" \n",
" predictions = model.forward(batch.Text).view(-1)\n",
" batch.Label = (batch.Label).type_as(predictions)\n",
" train_loss = criterion(predictions, batch.Label)\n",
" acc = binary_accuracy(predictions, batch.Label)\n",
" \n",
" train_loss.backward()\n",
" optimizer.step()\n",
" \n",
" train_loss_batch.append(train_loss.item())\n",
" accu_train_batch.append(acc)\n",
" bar.update()\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
"\n",
" epoch_train_losses.append(sum(train_loss_batch)/len(iterator))\n",
" accu_train_epoch.append(sum(accu_train_batch)/len(iterator))\n",
"\n",
" return epoch_train_losses[-1], accu_train_epoch[-1]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "S8PzsM3dHj3Y"
},
"source": [
"def evaluate(model, iterator, criterion):\n",
" \n",
" val_loss_batch = []\n",
" accu_val_batch = []\n",
" model.eval()\n",
"\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
" \n",
" with torch.no_grad():\n",
" bar = pyprind.ProgBar(len(iterator), bar_char='█')\n",
" for idx, batch in enumerate(iterator, 1):\n",
"\n",
" predictions = model.forward(batch.Text).view(-1)\n",
" batch.Label = (batch.Label).type_as(predictions)\n",
" val_loss = criterion(predictions, batch.Label)\n",
" \n",
" acc = binary_accuracy(predictions, batch.Label)\n",
"\n",
" val_loss_batch.append(val_loss.item())\n",
" accu_val_batch.append(acc)\n",
" bar.update()\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
" \n",
" epoch_val_losses.append(sum(val_loss_batch)/len(iterator))\n",
" accu_val_epoch.append(sum(accu_val_batch)/len(iterator))\n",
" return epoch_val_losses[-1], accu_val_epoch[-1]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "D6-ucY1adkIM"
},
"source": [
"# Training Phase"
]
},
{
"cell_type": "code",
"metadata": {
"id": "kgxdrB8nHp72"
},
"source": [
"for epoch in range(start_epochs+1, total_epochs+start_epochs+1):\n",
"\n",
" train_loss, train_acc = train(model, trainloader, optimizer, criterion)\n",
" valid_loss, valid_acc = evaluate(model, valloader, criterion)\n",
"\n",
" torch.save({\n",
" 'epoch': epoch,\n",
" 'model_state_dict': model.state_dict(),\n",
" 'optimizer_state_dict': optimizer.state_dict(),\n",
" 'loss': epoch_train_losses[-1],\n",
" }, os.path.join(CHECKPOINT, \"model.pth\"))\n",
" \n",
" print(f'| Epoch: [{epoch:02}/{total_epochs+start_epochs+1}] | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')"
],
"execution_count": null,
"outputs": []
}
]
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment