r-sajal/bert-ktrain-for-github.ipynb

## bert-ktrain-for-github.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "bert-ktrain-for-github.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyP+QTJjlfT+n6GmZ54XuFAM",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d/bert-ktrain-for-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "75jk41NMybZH"
      },
      "source": [
        "# **Model 2 : Ktrain**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "43wVOY4gyX2r"
      },
      "source": [
        "# Installing ktrain into the environment of the running kernel.\n",
        "# `%pip` is an explicit IPython magic, so this cell does not rely on automagic being enabled.\n",
        "# Dependency resolution can differ between pip versions, so Google Colab is the suggested environment.\n",
        "%pip install ktrain"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_5sx-6jDy1WZ"
      },
      "source": [
        "# Importing General Libraries\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sklearn\n",
        "import re\n",
        "import string as s\n",
        "import warnings, gc\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "# Tensorflow\n",
        "import tensorflow as tf\n",
        "\n",
        "# ktrain\n",
        "import ktrain\n",
        "from ktrain import text\n",
        "\n",
        "# sklearn\n",
        "from sklearn.model_selection import train_test_split\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "09cGqDzzzLw5"
      },
      "source": [
        "# Read the two JSON datasets (the paths point below /content/drive/My Drive).\n",
        "# Swap pd.read_json for another pandas reader if your files use a different format.\n",
        "train_json_path = \"/content/drive/My Drive/train_extra.json\"\n",
        "test_json_path = \"/content/drive/My Drive/embold_test.json\"\n",
        "train_sm_df = pd.read_json(train_json_path)\n",
        "test_df = pd.read_json(test_json_path)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yKdkS8KezU_S"
      },
      "source": [
        "# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.\n",
        "# Selecting with a list of column names keeps X and y as single-column DataFrames.\n",
        "data, target = ['text'], ['label']\n",
        "X, y = train_sm_df[data], train_sm_df[target]\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X,\n",
        "    y,\n",
        "    test_size=0.2,\n",
        "    random_state=42,\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EvjjZUFuzukl"
      },
      "source": [
        "# Transformer Model: ktrain preprocessor/model wrapper around the pretrained BERT checkpoint\n",
        "model_ = 'bert-base-uncased'\n",
        "t_mod = text.Transformer(model_, maxlen=100, classes = [0,1,2])\n",
        "\n",
        "\n",
        "# Converting the split data to plain lists so it can be passed to the preprocessor\n",
        "# train\n",
        "X_tr = X_train['text'].tolist()\n",
        "y_tr = y_train['label'].tolist()\n",
        "\n",
        "# test (used as validation data below)\n",
        "X_ts = X_test['text'].tolist()\n",
        "y_ts = y_test['label'].tolist()\n",
        "\n",
        "\n",
        "# Pre-processing training & test data.\n",
        "# The held-out split goes through preprocess_test (ktrain's documented pattern for validation data);\n",
        "# calling preprocess_train a second time would process it in training mode.\n",
        "train = t_mod.preprocess_train(X_tr, y_tr)\n",
        "test = t_mod.preprocess_test(X_ts, y_ts)\n",
        "\n",
        "# Model Classifier\n",
        "model = t_mod.get_classifier()\n",
        "\n",
        "# Batch size is a speed/quality trade-off and has to be tuned for your data and hardware:\n",
        "#  - larger batches make better use of GPU parallelism, but need more memory and can generalize worse\n",
        "#  - very small batches give noisy gradient estimates and slow training down\n",
        "learner = ktrain.get_learner(model, train_data=train, val_data=test, batch_size=8)  # typical batch sizes: 8, 16"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YcM3UUiE92ID"
      },
      "source": [
        "# Model Train (1cycle learning-rate policy)\n",
        "# learning_rate and epochs are defined here so the cell runs on a fresh kernel.\n",
        "# They are starting values to tune for your dataset (5e-5 for 4 epochs follows ktrain's transformer examples).\n",
        "# Training progress is printed for each epoch.\n",
        "learning_rate = 5e-5\n",
        "epochs = 4\n",
        "learner.fit_onecycle(learning_rate, epochs)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nhsWFWeSz-EP"
      },
      "source": [
        "# Prediction\n",
        "# Display names for the classes, listed in the order of the model labels 0, 1, 2; rename as you want\n",
        "classes = ['1', '2','3']\n",
        "predictor = ktrain.get_predictor(learner.model, preproc=t_mod)\n",
        "# Pick one text to classify. X_test keeps the original row labels after the shuffled split,\n",
        "# so select by position with .iloc; a label lookup such as [67] raises KeyError\n",
        "# whenever that row ended up in the training split.\n",
        "sample_text = X_test['text'].iloc[0]\n",
        "pred_class = predictor.predict(sample_text)\n",
        "# int() keeps the lookup valid even if the predictor returns the label as a string such as '0'\n",
        "print(\"Predicted Class: \", classes[int(pred_class)])"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "bert-ktrain-for-github.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyP+QTJjlfT+n6GmZ54XuFAM",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d/bert-ktrain-for-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "75jk41NMybZH"
	},
	"source": [
	"# Model 2 : Ktrain"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "43wVOY4gyX2r"
	},
	"source": [
	"# Installing ktrain into the environment of the running kernel.\n",
	"# `%pip` is an explicit IPython magic, so this cell does not rely on automagic being enabled.\n",
	"# Dependency resolution can differ between pip versions, so Google Colab is the suggested environment.\n",
	"%pip install ktrain"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "_5sx-6jDy1WZ"
	},
	"source": [
	"# Importing General Libraries\n",
	"\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import sklearn\n",
	"import re\n",
	"import string as s\n",
	"import warnings, gc\n",
	"warnings.filterwarnings(\"ignore\")\n",
	"\n",
	"# Tensorflow\n",
	"import tensorflow as tf\n",
	"\n",
	"# ktrain\n",
	"import ktrain\n",
	"from ktrain import text\n",
	"\n",
	"# sklearn\n",
	"from sklearn.model_selection import train_test_split\n"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "09cGqDzzzLw5"
	},
	"source": [
	"# Read the two JSON datasets (the paths point below /content/drive/My Drive).\n",
	"# Swap pd.read_json for another pandas reader if your files use a different format.\n",
	"train_json_path = \"/content/drive/My Drive/train_extra.json\"\n",
	"test_json_path = \"/content/drive/My Drive/embold_test.json\"\n",
	"train_sm_df = pd.read_json(train_json_path)\n",
	"test_df = pd.read_json(test_json_path)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "yKdkS8KezU_S"
	},
	"source": [
	"# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.\n",
	"# Selecting with a list of column names keeps X and y as single-column DataFrames.\n",
	"data, target = ['text'], ['label']\n",
	"X, y = train_sm_df[data], train_sm_df[target]\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    X,\n",
	"    y,\n",
	"    test_size=0.2,\n",
	"    random_state=42,\n",
	")"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "EvjjZUFuzukl"
	},
	"source": [
	"# Transformer Model: ktrain preprocessor/model wrapper around the pretrained BERT checkpoint\n",
	"model_ = 'bert-base-uncased'\n",
	"t_mod = text.Transformer(model_, maxlen=100, classes = [0,1,2])\n",
	"\n",
	"\n",
	"# Converting the split data to plain lists so it can be passed to the preprocessor\n",
	"# train\n",
	"X_tr = X_train['text'].tolist()\n",
	"y_tr = y_train['label'].tolist()\n",
	"\n",
	"# test (used as validation data below)\n",
	"X_ts = X_test['text'].tolist()\n",
	"y_ts = y_test['label'].tolist()\n",
	"\n",
	"\n",
	"# Pre-processing training & test data.\n",
	"# The held-out split goes through preprocess_test (ktrain's documented pattern for validation data);\n",
	"# calling preprocess_train a second time would process it in training mode.\n",
	"train = t_mod.preprocess_train(X_tr, y_tr)\n",
	"test = t_mod.preprocess_test(X_ts, y_ts)\n",
	"\n",
	"# Model Classifier\n",
	"model = t_mod.get_classifier()\n",
	"\n",
	"# Batch size is a speed/quality trade-off and has to be tuned for your data and hardware:\n",
	"#  - larger batches make better use of GPU parallelism, but need more memory and can generalize worse\n",
	"#  - very small batches give noisy gradient estimates and slow training down\n",
	"learner = ktrain.get_learner(model, train_data=train, val_data=test, batch_size=8)  # typical batch sizes: 8, 16"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "YcM3UUiE92ID"
	},
	"source": [
	"# Model Train (1cycle learning-rate policy)\n",
	"# learning_rate and epochs are defined here so the cell runs on a fresh kernel.\n",
	"# They are starting values to tune for your dataset (5e-5 for 4 epochs follows ktrain's transformer examples).\n",
	"# Training progress is printed for each epoch.\n",
	"learning_rate = 5e-5\n",
	"epochs = 4\n",
	"learner.fit_onecycle(learning_rate, epochs)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "nhsWFWeSz-EP"
	},
	"source": [
	"# Prediction\n",
	"# Display names for the classes, listed in the order of the model labels 0, 1, 2; rename as you want\n",
	"classes = ['1', '2','3']\n",
	"predictor = ktrain.get_predictor(learner.model, preproc=t_mod)\n",
	"# Pick one text to classify. X_test keeps the original row labels after the shuffled split,\n",
	"# so select by position with .iloc; a label lookup such as [67] raises KeyError\n",
	"# whenever that row ended up in the training split.\n",
	"sample_text = X_test['text'].iloc[0]\n",
	"pred_class = predictor.predict(sample_text)\n",
	"# int() keeps the lookup valid even if the predictor returns the label as a string such as '0'\n",
	"print(\"Predicted Class: \", classes[int(pred_class)])"
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}