ritog/copy-of-robi-poems.ipynb

## copy-of-robi-poems.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Copy of robi-poems.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "15cfa01ee073450886a880886604c562": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_9caabaee8a0a499f9db335174e2295d5",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_62462f8646464a38aa5fceb53af0c72b",
              "IPY_MODEL_724b8ea1b43647ba860325090ed99250"
            ]
          }
        },
        "9caabaee8a0a499f9db335174e2295d5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "62462f8646464a38aa5fceb53af0c72b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_608a96da3b834b2dbee688614c041fbd",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 881,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 881,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_15031a8ce2c14dd8a1af5112abc4bc99"
          }
        },
        "724b8ea1b43647ba860325090ed99250": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_4557a319d0024a8f8d3d5e10137e8120",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 881/881 [00:08&lt;00:00, 107B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_25fe9839fa41409fa1b8be4f9caa1dc4"
          }
        },
        "608a96da3b834b2dbee688614c041fbd": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "15031a8ce2c14dd8a1af5112abc4bc99": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "4557a319d0024a8f8d3d5e10137e8120": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "25fe9839fa41409fa1b8be4f9caa1dc4": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "2c2664b316e843d98e97ed6ddd78ad1d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_87b54d2be7e44a14b71880aa7f0977fa",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_0e26bf1b7ff64064858b55f64c13e5c9",
              "IPY_MODEL_10067a00aa7f46e895a11ea42c1a0c83"
            ]
          }
        },
        "87b54d2be7e44a14b71880aa7f0977fa": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "0e26bf1b7ff64064858b55f64c13e5c9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_5e51cc0230f9477ea229a9c8cf0a72c5",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 1708956,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 1708956,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_b7b5ac4bfa484e05bfb65ecd0fdcabec"
          }
        },
        "10067a00aa7f46e895a11ea42c1a0c83": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_90fe734421514632b7455707fdb6647a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 1.71M/1.71M [00:06&lt;00:00, 253kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_fef095a5d6e047dcb79bff65a0b80124"
          }
        },
        "5e51cc0230f9477ea229a9c8cf0a72c5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "b7b5ac4bfa484e05bfb65ecd0fdcabec": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "90fe734421514632b7455707fdb6647a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "fef095a5d6e047dcb79bff65a0b80124": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "580bb65ee3824889b5631568b99a58c8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_aac5891278e84c5291e5b0164044116e",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_66d20b10edfe4958a39368199fcaad3b",
              "IPY_MODEL_306b0cfe44594bca8befbf05d6deff51"
            ]
          }
        },
        "aac5891278e84c5291e5b0164044116e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "66d20b10edfe4958a39368199fcaad3b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_072fc33d415b4421a069677ad61ff011",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 510401385,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 510401385,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_9e3c7fe1fff44b41af0798b7fe192f0f"
          }
        },
        "306b0cfe44594bca8befbf05d6deff51": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_94f04430f25f49beb366ab3ca728ec3a",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 510M/510M [00:13&lt;00:00, 36.7MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_6f4cca4a80274eadaf20d28acb074d6a"
          }
        },
        "072fc33d415b4421a069677ad61ff011": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "9e3c7fe1fff44b41af0798b7fe192f0f": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "94f04430f25f49beb366ab3ca728ec3a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "6f4cca4a80274eadaf20d28acb074d6a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ghosh-r/757c32d5b5065a91e63beb017e38dba9/copy-of-robi-poems.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dV9hhxEKF48s"
      },
      "source": [
        "%%capture\n",
        "! pip install datasets transformers"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MZ13avB4Ebnw"
      },
      "source": [
        "# import re\n",
        "import json\n",
        "from sklearn.model_selection import train_test_split\n",
        "import pandas as pd"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 132,
          "referenced_widgets": [
            "15cfa01ee073450886a880886604c562",
            "9caabaee8a0a499f9db335174e2295d5",
            "62462f8646464a38aa5fceb53af0c72b",
            "724b8ea1b43647ba860325090ed99250",
            "608a96da3b834b2dbee688614c041fbd",
            "15031a8ce2c14dd8a1af5112abc4bc99",
            "4557a319d0024a8f8d3d5e10137e8120",
            "25fe9839fa41409fa1b8be4f9caa1dc4",
            "2c2664b316e843d98e97ed6ddd78ad1d",
            "87b54d2be7e44a14b71880aa7f0977fa",
            "0e26bf1b7ff64064858b55f64c13e5c9",
            "10067a00aa7f46e895a11ea42c1a0c83",
            "5e51cc0230f9477ea229a9c8cf0a72c5",
            "b7b5ac4bfa484e05bfb65ecd0fdcabec",
            "90fe734421514632b7455707fdb6647a",
            "fef095a5d6e047dcb79bff65a0b80124"
          ]
        },
        "id": "yGHBLbrGIWyV",
        "outputId": "eb5296d4-3223-4dbc-de2c-be5bb8c70128"
      },
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"ghosh-r/bangla-gpt2\")\n",
        "\n",
        "train_path = 'train.txt'\n",
        "test_path = 'test.txt'"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "15cfa01ee073450886a880886604c562",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=881.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "2c2664b316e843d98e97ed6ddd78ad1d",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1708956.0, style=ProgressStyle(descript…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1b4f-k8EIdXk",
        "outputId": "f18dc59f-21cf-4268-8622-65f56430b036"
      },
      "source": [
        "from transformers import TextDataset,DataCollatorForLanguageModeling\n",
        "\n",
        "def load_dataset(train_path,test_path,tokenizer):\n",
        "    train_dataset = TextDataset(\n",
        "          tokenizer=tokenizer,\n",
        "          file_path=train_path,\n",
        "          block_size=128)\n",
        "     \n",
        "    test_dataset = TextDataset(\n",
        "          tokenizer=tokenizer,\n",
        "          file_path=test_path,\n",
        "          block_size=128)   \n",
        "    \n",
        "    data_collator = DataCollatorForLanguageModeling(\n",
        "        tokenizer=tokenizer, mlm=False,\n",
        "    )\n",
        "    return train_dataset,test_dataset,data_collator\n",
        "\n",
        "train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm.py\n",
            "  FutureWarning,\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 120,
          "referenced_widgets": [
            "580bb65ee3824889b5631568b99a58c8",
            "aac5891278e84c5291e5b0164044116e",
            "66d20b10edfe4958a39368199fcaad3b",
            "306b0cfe44594bca8befbf05d6deff51",
            "072fc33d415b4421a069677ad61ff011",
            "9e3c7fe1fff44b41af0798b7fe192f0f",
            "94f04430f25f49beb366ab3ca728ec3a",
            "6f4cca4a80274eadaf20d28acb074d6a"
          ]
        },
        "id": "FuETPslLIo4U",
        "outputId": "51042dd7-674a-4cbc-868c-e33dda336ea7"
      },
      "source": [
        "from transformers import Trainer, TrainingArguments,AutoModelWithLMHead\n",
        "\n",
        "model = AutoModelWithLMHead.from_pretrained(\"ghosh-r/bangla-gpt2\")\n",
        "\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./gpt2-robi_kobita\", #The output directory\n",
        "    overwrite_output_dir=True, #overwrite the content of the output directory\n",
        "    num_train_epochs=3, # number of training epochs\n",
        "    per_device_train_batch_size=32, # batch size for training\n",
        "    per_device_eval_batch_size=64,  # batch size for evaluation\n",
        "    eval_steps = 400, # Number of update steps between two evaluations.\n",
        "    save_steps=800, # after # steps model is saved \n",
        "    warmup_steps=500,# number of warmup steps for learning rate scheduler\n",
        "    prediction_loss_only=True,\n",
        "    )\n",
        "\n",
        "\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    data_collator=data_collator,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=test_dataset,\n",
        ")"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:847: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
            "  FutureWarning,\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "580bb65ee3824889b5631568b99a58c8",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=510401385.0, style=ProgressStyle(descri…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 347
        },
        "id": "j-t8_0pjZQiu",
        "outputId": "9283797a-2d02-4b69-bc37-2e665deeaf12"
      },
      "source": [
        "trainer.train()"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "***** Running training *****\n",
            "  Num examples = 6888\n",
            "  Num Epochs = 3\n",
            "  Instantaneous batch size per device = 32\n",
            "  Total train batch size (w. parallel, distributed & accumulation) = 32\n",
            "  Gradient Accumulation steps = 1\n",
            "  Total optimization steps = 648\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='648' max='648' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [648/648 06:08, Epoch 3/3]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>Step</th>\n",
              "      <th>Training Loss</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>500</td>\n",
              "      <td>2.350000</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
            "\n",
            "\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "TrainOutput(global_step=648, training_loss=2.242282631956501, metrics={'train_runtime': 369.4134, 'train_samples_per_second': 55.937, 'train_steps_per_second': 1.754, 'total_flos': 1974853779849216.0, 'train_loss': 2.242282631956501, 'epoch': 3.0})"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jBRFaebLd9NA",
        "outputId": "adf8f2f6-1fb5-4966-cb5d-413f363d1884"
      },
      "source": [
        "trainer.save_model()"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Saving model checkpoint to ./gpt2-robi_kobita\n",
            "Configuration saved in ./gpt2-robi_kobita/config.json\n",
            "Model weights saved in ./gpt2-robi_kobita/pytorch_model.bin\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PBW6W2D-d0HJ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a817202f-34e6-4e4d-ec17-bde31dedae4d"
      },
      "source": [
        "from transformers import pipeline\n",
        "\n",
        "poet = pipeline('text-generation',model='./gpt2-robi_kobita', tokenizer='ghosh-r/bangla-gpt2')"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "loading configuration file ./gpt2-robi_kobita/config.json\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.0,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.0,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"gradient_checkpointing\": false,\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 12,\n",
            "  \"n_positions\": 1024,\n",
            "  \"resid_pdrop\": 0.0,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"torch_dtype\": \"float32\",\n",
            "  \"transformers_version\": \"4.8.2\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading configuration file ./gpt2-robi_kobita/config.json\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.0,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.0,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"gradient_checkpointing\": false,\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 12,\n",
            "  \"n_positions\": 1024,\n",
            "  \"resid_pdrop\": 0.0,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"torch_dtype\": \"float32\",\n",
            "  \"transformers_version\": \"4.8.2\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading weights file ./gpt2-robi_kobita/pytorch_model.bin\n",
            "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
            "\n",
            "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./gpt2-robi_kobita.\n",
            "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n",
            "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
            "loading configuration file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d916cda1fc7c33b497e404011afd7185469a7d6daf8d674e4bcbdb951e059768.0d65f2cde7eca11af939c04f8bd685073e094591ded3eb632f51dc0e29899bde\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"ghosh-r/bangla-gpt2\",\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.0,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.0,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"gradient_checkpointing\": false,\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 12,\n",
            "  \"n_positions\": 1024,\n",
            "  \"resid_pdrop\": 0.0,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"torch_dtype\": \"float32\",\n",
            "  \"transformers_version\": \"4.8.2\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/vocab.json from cache at None\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/merges.txt from cache at None\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/8b3d4a6f9e80e0e5494bf8f1754de822dcdc06314d0994c9b68f7dbc1735cffa.9dfff3c35c0a768eb26244f921a71293e1adaa890f1468c982bad5568d6cd623\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/added_tokens.json from cache at None\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/special_tokens_map.json from cache at None\n",
            "loading file https://huggingface.co/ghosh-r/bangla-gpt2/resolve/main/tokenizer_config.json from cache at None\n",
            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SPzHEeeUeHJZ",
        "outputId": "aba9c81d-3f0e-4882-8ec8-ae95a2ce8c99"
      },
      "source": [
        "poem = poet('আমি তোমাকে দেখেছি মোর হৃদয়ে', max_length=200)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        },
        "id": "oIOlTCTdxyKK",
        "outputId": "c9742aca-b5d4-4b6f-bac1-869183ae4e5a"
      },
      "source": [
        "poem[0]['generated_text']"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'আমি তোমাকে দেখেছি মোর হৃদয়ের কুকুরের মতো।\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0 মুহূর্তে সব নেভায়ে,\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0 চিরদিন থাকে কেন আমার ঘরে?\\n\\xa0\\xa0\\xa0জীবন আমার হৃদয়ের পালঙ্কের একেবারে কাছে,\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0আমার হৃদয়-সমুদ্র পারে।\\n\\xa0\\xa0\\xa0মনে হত আমার কত কত সংসারে\\n\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0\\xa0কত রাত্\\u200cরির কত বিহঙ্গ যেন হয়।\\n\\xa0\\xa0মনে হত পৃথিবীর কত মা যেন'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GMvtF1Vg2oBk"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}