Created
October 18, 2020 13:03
-
-
Save r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d to your computer and use it in GitHub Desktop.
bert-ktrain-for-github.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "bert-ktrain-for-github.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyP+QTJjlfT+n6GmZ54XuFAM", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d/bert-ktrain-for-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "75jk41NMybZH" | |
}, | |
"source": [ | |
"# **Model 2 : Ktrain**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "43wVOY4gyX2r" | |
}, | |
"source": [ | |
"# Install ktrain. The explicit %pip magic installs into the environment of the running kernel.\n", | |
"# Installation may fail locally because of pip policy changes; Google Colab is the suggested environment.\n", | |
"%pip install -q ktrain" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_5sx-6jDy1WZ" | |
}, | |
"source": [ | |
"# Importing general libraries\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import sklearn\n", | |
"import re\n", | |
"import string as s\n", | |
"import warnings, gc\n", | |
"warnings.filterwarnings(\"ignore\")\n", | |
"\n", | |
"# Tensorflow\n", | |
"import tensorflow as tf\n", | |
"\n", | |
"# ktrain\n", | |
"import ktrain\n", | |
"from ktrain import text\n", | |
"\n", | |
"# sklearn\n", | |
"from sklearn.model_selection import train_test_split\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "09cGqDzzzLw5" | |
}, | |
"source": [ | |
"# Load the training and test sets (the paths point into a Drive folder under /content/drive).\n", | |
"# Swap pd.read_json for another pandas reader if your dataset is stored in a different format.\n", | |
"train_sm_df = pd.read_json(\n", | |
"    \"/content/drive/My Drive/train_extra.json\"\n", | |
")\n", | |
"test_df = pd.read_json(\n", | |
"    \"/content/drive/My Drive/embold_test.json\"\n", | |
")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "yKdkS8KezU_S" | |
}, | |
"source": [ | |
"# Hold out 20% of the labelled data for validation.\n", | |
"target = ['label']  # label column\n", | |
"data = ['text']     # feature column\n", | |
"\n", | |
"# Selecting with a list keeps X and y as DataFrames, so later cells can index them by column name.\n", | |
"X = train_sm_df[data]\n", | |
"y = train_sm_df[target]\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    X,\n", | |
"    y,\n", | |
"    test_size=0.2,\n", | |
"    random_state=42,  # fixed seed so the split is reproducible\n", | |
")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EvjjZUFuzukl" | |
}, | |
"source": [ | |
"# Transformer model\n", | |
"model_ = 'bert-base-uncased'\n", | |
"t_mod = text.Transformer(model_, maxlen=100, classes=[0, 1, 2])\n", | |
"\n", | |
"# Convert the split DataFrames to plain Python lists before handing them to the preprocessor.\n", | |
"# train\n", | |
"X_tr = X_train['text'].tolist()\n", | |
"y_tr = y_train['label'].tolist()\n", | |
"\n", | |
"# held-out (validation) split\n", | |
"X_ts = X_test['text'].tolist()\n", | |
"y_ts = y_test['label'].tolist()\n", | |
"\n", | |
"# Pre-process the training and held-out data.\n", | |
"# The held-out split must go through preprocess_test, not preprocess_train,\n", | |
"# so that it is prepared as validation data rather than as a second training set.\n", | |
"train = t_mod.preprocess_train(X_tr, y_tr)\n", | |
"test = t_mod.preprocess_test(X_ts, y_ts)\n", | |
"\n", | |
"# Model classifier\n", | |
"model = t_mod.get_classifier()\n", | |
"\n", | |
"# Batch size is a tuning knob:\n", | |
"#   - a larger batch makes better use of GPU parallelism and speeds up each epoch,\n", | |
"#     but needs more memory and, if pushed too far, can lead to poorer results;\n", | |
"#   - a very small batch gives noisy updates and can also hurt the final result.\n", | |
"# Tune it according to your data and hardware.\n", | |
"learner = ktrain.get_learner(model, train_data=train, val_data=test, batch_size=8)  # 8 or 16 are reasonable starting points" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "YcM3UUiE92ID" | |
}, | |
"source": [ | |
"# Train the model.\n", | |
"# Both hyperparameters are defined here so the cell runs on a fresh kernel; tune them for your dataset.\n", | |
"learning_rate = 5e-5  # a common starting point when fine-tuning BERT\n", | |
"epochs = 4\n", | |
"# A training summary is shown as the output of this cell.\n", | |
"learner.fit_onecycle(learning_rate, epochs)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nhsWFWeSz-EP" | |
}, | |
"source": [ | |
"# Prediction\n", | |
"# Display names for the model's classes 0, 1, 2 (in that order); rename them as you like.\n", | |
"classes = ['1', '2', '3']\n", | |
"predictor = ktrain.get_predictor(learner.model, preproc=t_mod)\n", | |
"# Pass any single string you want to classify.\n", | |
"# .iloc selects by position: after the random split, index label 67 is not guaranteed to exist in X_test.\n", | |
"sample_text = X_test['text'].iloc[0]\n", | |
"pred_class = predictor.predict(sample_text)\n", | |
"# int() keeps the lookup working whether the predictor returns the label as 0 or as '0'.\n", | |
"print(\"Predicted Class: \", classes[int(pred_class)])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment