@mohammedkhalilia
Last active April 24, 2024 21:20
train_flat_arabic_ner.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPMhCxt1IEvsn94kp3EylPp",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/mohammedkhalilia/72c3261734d7715094089bdf4de74b4a/train_flat_arabic_ner.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BtmgHNnvrbNI"
},
"outputs": [],
"source": [
"# Verify that you have the GPU recognized\n",
"!nvidia-smi"
]
},
{
"cell_type": "code",
"source": [
"# Install dependencies\n",
"!pip uninstall torch torchtext torchvision torchvision torchdata torchaudio\n",
"!pip install torch==1.13.0\n",
"!pip install transformers==4.24.0\n",
"!pip install torchtext==0.14.0\n",
"!pip install torchvision==0.14.0\n",
"!pip install torchdata==0.5.1\n",
"!pip install seqeval==1.2.2"
],
"metadata": {
"id": "dtO91IXuujsZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Remove existing package and clone again from Github\n",
"!rm -rf /content/ArabicNER\n",
"!git clone https://github.com/SinaLab/ArabicNER.git"
],
"metadata": {
"id": "jLHHpnFXr7-K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Add the ArabicNER package to the system path\n",
"import sys\n",
"import argparse\n",
"sys.path.append('/content/ArabicNER/')"
],
"metadata": {
"id": "o4EvIlcrssU6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Import train function\n",
"from arabiner.bin.train import main as train"
],
"metadata": {
"id": "-rM-8p4nsztp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Setup the model arguments\n",
"args_dict = {\n",
" # Model output path to save artifacts and model predictions\n",
" \"output_path\": \"/content/output/\",\n",
"\n",
" # train/test/validation data paths\n",
" \"train_path\": \"/content/ArabicNER/data/train.txt\",\n",
" \"test_path\": \"/content/ArabicNER/data/test.txt\",\n",
" \"val_path\": \"/content/ArabicNER/data/val.txt\",\n",
"\n",
" # seed for randomization\n",
" \"seed\": 1,\n",
"\n",
" \"batch_size\": 8,\n",
"\n",
" # Nmber of workers for the dataloader\n",
" \"num_workers\": 1,\n",
"\n",
" # GPU/device Ids to train model on\n",
" # For two GPUs use [0, 1]\n",
" # For three GPUs use [0, 1, 2], etc.\n",
" \"gpus\": [0],\n",
"\n",
" # Overwrite data in output_path directory specified above\n",
" \"overwrite\": True,\n",
"\n",
" # How often to print the logs in terms of number of steps\n",
" \"log_interval\": 10,\n",
"\n",
" # Data configuration\n",
" # Here we specify the dataset class and there are two options:\n",
" # arabiner.data.datasets.DefaultDataset: for flat NER\n",
" # arabiner.data.datasets.NestedTagsDataset: for nested NER\n",
" #\n",
" # kwargs: keyword arguments to the dataset class\n",
" # This notebook used the DefaultDataset for flat NER\n",
" \"data_config\": {\n",
" \"fn\": \"arabiner.data.datasets.DefaultDataset\",\n",
" \"kwargs\": {\"max_seq_len\": 512}\n",
" },\n",
"\n",
" # Neural net configuration\n",
" # There are two NNs:\n",
" # arabiner.nn.BertSeqTagger: flat NER tagger\n",
" # arabiner.nn.BertNestedTagger: nested NER tagger\n",
" #\n",
" # kwargs: keyword arguments to the NN\n",
" # This notebook uses BertSeqTagger for flat NER tagging\n",
" \"network_config\": {\n",
" \"fn\": \"arabiner.nn.BertSeqTagger\",\n",
" \"kwargs\": {\"dropout\": 0.1, \"bert_model\": \"aubmindlab/bert-base-arabertv2\"}\n",
" },\n",
"\n",
" # Model trainer configuration\n",
" #\n",
" # arabiner.trainers.BertTrainer: for flat NER training\n",
" # arabiner.trainers.BertNestedTrainer: for nested NER training\n",
" #\n",
" # kwargs: keyword arguments to arabiner.trainers.BertTrainer\n",
" # additional arguments you can pass includes\n",
" # - clip: for gradient clpping\n",
" # - patience: number of epochs for early termination\n",
" # This notebook uses BertTrainer for fat NER training\n",
" \"trainer_config\": {\n",
" \"fn\": \"arabiner.trainers.BertTrainer\",\n",
" \"kwargs\": {\"max_epochs\": 50}\n",
" },\n",
"\n",
" # Optimizer configuration\n",
" # Our experiments use torch.optim.AdamW, however, you are free to pass\n",
" # any other optmizers such as torch.optim.Adam or torch.optim.SGD\n",
" # lr: learning rate\n",
" # kwargs: keyword arguments to torch.optim.AdamW or whatever optimizer you use\n",
" #\n",
" # Additional optimizers can be found here:\n",
" # https://pytorch.org/docs/stable/optim.html\n",
" \"optimizer\": {\n",
" \"fn\": \"torch.optim.AdamW\",\n",
" \"kwargs\": {\"lr\": 0.0001}\n",
" },\n",
"\n",
" # Learning rate scheduler configuration\n",
" # You can pass a learning scheduler such as torch.optim.lr_scheduler.StepLR\n",
" # kwargs: keyword arguments to torch.optim.AdamW or whatever scheduler you use\n",
" #\n",
" # Additional schedulers can be found here:\n",
" # https://pytorch.org/docs/stable/optim.html\n",
" \"lr_scheduler\": {\n",
" \"fn\": \"torch.optim.lr_scheduler.ExponentialLR\",\n",
" \"kwargs\": {\"gamma\": 1}\n",
" },\n",
"\n",
" # Loss function configuration\n",
" # We use cross entropy loss\n",
" # kwargs: keyword arguments to torch.nn.CrossEntropyLoss or whatever loss function you use\n",
" \"loss\": {\n",
" \"fn\": \"torch.nn.CrossEntropyLoss\",\n",
" \"kwargs\": {}\n",
" }\n",
"}\n",
"\n",
"# Convert args dictionary to argparse namespace\n",
"args = argparse.Namespace()\n",
"args.__dict__ = args_dict"
],
"metadata": {
"id": "UqqvY1fXtZpD"
},
"execution_count": null,
"outputs": []
},
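{
"cell_type": "code",
"source": [
"# (Sketch, not run by default) The config comments above list nested-NER\n",
"# counterparts for the dataset, network and trainer classes. To train a nested\n",
"# tagger instead of the flat one, the same args_dict can be reused with those\n",
"# classes swapped in. Untested here; the nested setup likely also requires\n",
"# nested-format annotations in the train/test/val files (see the ArabicNER repo).\n",
"import copy\n",
"\n",
"nested_args_dict = copy.deepcopy(args_dict)\n",
"nested_args_dict[\"data_config\"][\"fn\"] = \"arabiner.data.datasets.NestedTagsDataset\"\n",
"nested_args_dict[\"network_config\"][\"fn\"] = \"arabiner.nn.BertNestedTagger\"\n",
"nested_args_dict[\"trainer_config\"][\"fn\"] = \"arabiner.trainers.BertNestedTrainer\"\n",
"\n",
"# hypothetical separate directory so the flat-NER artifacts are not overwritten\n",
"nested_args_dict[\"output_path\"] = \"/content/output_nested/\"\n",
"\n",
"nested_args = argparse.Namespace()\n",
"nested_args.__dict__ = nested_args_dict\n",
"# nested_args can then be passed to train() in place of args"
],
"metadata": {},
"execution_count": null,
"outputs": []
},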
{
"cell_type": "code",
"source": [
"# Start training the model\n",
"train(args)"
],
"metadata": {
"id": "EJQMuAC8taZB"
},
"execution_count": null,
"outputs": []
}
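,
{
"cell_type": "code",
"source": [
"# (Sketch) When training finishes, the model artifacts and predictions are\n",
"# written to the output_path configured above; listing that directory is a\n",
"# quick way to confirm the run produced output.\n",
"!ls -lR /content/output/"
],
"metadata": {},
"execution_count": null,
"outputs": []
}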
]
}
@ahmedoumar

Hello Dr. Mohammed, thanks for the notebook.

I just noticed that it does not work with other models such as MARBERTv2 and ARBERTv2.
Maybe you can look into that when you have some time.

Thanks

@mohammedkhalilia
Author

I have not tried MARBERTv2 and ARBERTv2.
Can you let me know what error you are getting with either of those two models?

@ahmedoumar


[Screenshot of the error attached: "Screenshot from 2023-08-16 17-26-51"]

@mohammedkhalilia
Author

Can you change the batch size? Try a higher or lower value and let me know.

@ahmedoumar

I used a smaller batch size and a higher one, and it didn't work.
Maybe the tokenizer ignores some non-Arabic words?
Even though arabertv2, which has a small vocab size, works fine on the data.

@mohammedkhalilia
Author

mohammedkhalilia commented Apr 24, 2024 via email
