Skip to content

Instantly share code, notes, and snippets.

@r-sajal
Created October 18, 2020 13:03
Show Gist options
  • Save r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d to your computer and use it in GitHub Desktop.
Save r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d to your computer and use it in GitHub Desktop.
bert-ktrain-for-github.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "bert-ktrain-for-github.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyP+QTJjlfT+n6GmZ54XuFAM",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/r-sajal/1b7ee74a4cf8279c5f5abf4fe601094d/bert-ktrain-for-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "75jk41NMybZH"
},
"source": [
"# **Model 2 : Ktrain**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "43wVOY4gyX2r"
},
"source": [
"# Installing ktrain\n",
"# Installation may fail on some local setups because of changes in pip's dependency handling,\n",
"# so running this notebook on Google Colab is suggested.\n",
"# The explicit %pip magic installs into the running kernel's environment and does not rely on automagic.\n",
"%pip install ktrain"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_5sx-6jDy1WZ"
},
"source": [
"# Importing Genearal Libraries\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn\n",
"import re\n",
"import string as s\n",
"import warnings, gc\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Tensorflow\n",
"import tensorflow as tf\n",
"\n",
"# ktrain\n",
"import ktrain\n",
"from ktrain import text\n",
"\n",
"# sklearn\n",
"from sklearn.model_selection import train_test_split\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "09cGqDzzzLw5"
},
"source": [
"# Load the training and test sets from JSON files.\n",
"# The paths assume Google Drive is already mounted at /content/drive;\n",
"# swap pd.read_json for another reader if your dataset uses a different format.\n",
"train_path = \"/content/drive/My Drive/train_extra.json\"\n",
"test_path = \"/content/drive/My Drive/embold_test.json\"\n",
"train_sm_df = pd.read_json(train_path)\n",
"test_df = pd.read_json(test_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yKdkS8KezU_S"
},
"source": [
"# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.\n",
"data = ['text']      # feature column(s)\n",
"target = ['label']   # label column(s)\n",
"X, y = train_sm_df[data], train_sm_df[target]\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "EvjjZUFuzukl"
},
"source": [
"# Transformer Model\n",
"model_ = 'bert-base-uncased'\n",
"t_mod = text.Transformer(model_, maxlen=100, classes = [0,1,2])\n",
"\n",
"\n",
"# Convert the split DataFrames to plain Python lists before handing them to the preprocessor\n",
"# train\n",
"X_tr = X_train['text'].tolist()\n",
"y_tr = y_train['label'].tolist()\n",
"\n",
"# hold-out (validation) split\n",
"X_ts = X_test['text'].tolist()\n",
"y_ts = y_test['label'].tolist()\n",
"\n",
"\n",
"# Pre-processing: preprocess_train for the training split, preprocess_test for the hold-out split,\n",
"# so the validation data is treated as evaluation data rather than as a second training set\n",
"train = t_mod.preprocess_train(X_tr, y_tr)\n",
"test = t_mod.preprocess_test(X_ts, y_ts)\n",
"\n",
"# Model Classifier\n",
"model = t_mod.get_classifier()\n",
"\n",
"# Batch size is a trade-off you have to tune:\n",
"#  - larger batches make better use of the GPU (more parallel work per step) but need more memory,\n",
"#    and very large batches can give poorer results;\n",
"#  - very small batches give noisy gradient estimates and can also give poor results.\n",
"# 8 or 16 are suggested starting points.\n",
"learner = ktrain.get_learner(model, train_data=train, val_data=test, batch_size=8)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "YcM3UUiE92ID"
},
"source": [
"# Model Train\n",
"# Hyperparameters must be defined before the call; these are starting points only, tune them for your data.\n",
"learning_rate = 5e-5  # maximum learning rate of the one-cycle schedule\n",
"epochs = 4            # number of passes over the training data\n",
"\n",
"# As output you will get a training summary.\n",
"learner.fit_onecycle(learning_rate, epochs)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nhsWFWeSz-EP"
},
"source": [
"# Prediction\n",
"# Display names for the classes, listed in label order (0, 1, 2); rename them as you like\n",
"classes = ['1', '2','3']\n",
"predictor = ktrain.get_predictor(learner.model, preproc=t_mod)\n",
"# predict() accepts a single string or a list of strings.\n",
"# .iloc selects by position: after the shuffled split, index label 67 may not exist in X_test,\n",
"# so a label-based lookup (X_test['text'][67]) can raise KeyError.\n",
"sample_text = X_test['text'].iloc[67]\n",
"pred_class = predictor.predict(sample_text)\n",
"# int() keeps the lookup working whether the predicted label comes back as 0/1/2 or '0'/'1'/'2'\n",
"# (assumption: the predictor returns one of the labels given to text.Transformer; verify on your ktrain version)\n",
"print(\"Predicted Class: \", classes[int(pred_class)])"
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment